10 import xml.etree.ElementTree
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.comedycentral import ComedyCentralIE
25 from .extractor.dailymotion import DailymotionIE
26 from .extractor.gametrailers import GametrailersIE
27 from .extractor.generic import GenericIE
28 from .extractor.metacafe import MetacafeIE
29 from .extractor.myvideo import MyVideoIE
30 from .extractor.statigram import StatigramIE
31 from .extractor.photobucket import PhotobucketIE
32 from .extractor.vimeo import VimeoIE
33 from .extractor.yahoo import YahooIE, YahooSearchIE
34 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
35 from .extractor.zdf import ZDFIE
# DepositFilesIE — resolves a depositfiles.com file page to a direct download
# URL by re-requesting the page with the 'gateway_result' form field set
# (simulating the "Free download" button) and scraping the fileshare form action.
# NOTE(review): this listing embeds original line numbers and has lines elided
# (e.g. the `try:` that must precede line 69, and the `return [{...}]` wrapper
# around lines 92-97) — confirm against the full source before editing.
55 class DepositFilesIE(InfoExtractor):
56 """Information extractor for depositfiles.com"""
58 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
60 def _real_extract(self, url):
61 file_id = url.split('/')[-1]
62 # Rebuild url in english locale
63 url = 'http://depositfiles.com/en/files/' + file_id
65 # Retrieve file webpage with 'Free download' button pressed
66 free_download_indication = { 'gateway_result' : '1' }
67 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
# (elided `try:` here in the listing — line 68 missing)
69 self.report_download_webpage(file_id)
70 webpage = compat_urllib_request.urlopen(request).read()
71 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
72 raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))
74 # Search for the real file URL
75 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
76 if (mobj is None) or (mobj.group(1) is None):
77 # Try to figure out reason of the error.
78 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
79 if (mobj is not None) and (mobj.group(1) is not None):
80 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
81 raise ExtractorError(u'%s' % restriction_message)
83 raise ExtractorError(u'Unable to extract download URL from: %s' % url)
85 file_url = mobj.group(1)
86 file_extension = os.path.splitext(file_url)[1][1:]
88 # Search for file title
89 file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')
# Result dict fields — the surrounding `return [{ ... }]` is elided from this listing.
92 'id': file_id.decode('utf-8'),
93 'url': file_url.decode('utf-8'),
97 'ext': file_extension.decode('utf-8'),
# FacebookIE — extracts Facebook videos. Optionally logs in first
# (_real_initialize) using --username/--password or .netrc credentials, then
# (_real_extract) parses an inline JSON blob between two known SWF-setup
# JavaScript fragments to find hd_src/sd_src stream URLs.
# NOTE(review): listing is elided (missing `try:` lines, `if mobj is None:`
# guards, `return` wrappers) — confirm against the full source before editing.
101 class FacebookIE(InfoExtractor):
102 """Information Extractor for Facebook"""
104 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
105 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
106 _NETRC_MACHINE = 'facebook'
107 IE_NAME = u'facebook'
109 def report_login(self):
110 """Report attempt to log in."""
111 self.to_screen(u'Logging in')
113 def _real_initialize(self):
# No downloader attached -> nothing to log in with (elided body follows).
114 if self._downloader is None:
119 downloader_params = self._downloader.params
121 # Attempt to use provided username and password or .netrc data
122 if downloader_params.get('username', None) is not None:
123 useremail = downloader_params['username']
124 password = downloader_params['password']
125 elif downloader_params.get('usenetrc', False):
127 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
132 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
133 except (IOError, netrc.NetrcParseError) as err:
134 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
137 if useremail is None:
146 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
# (elided `try:` before the urlopen below)
149 login_results = compat_urllib_request.urlopen(request).read()
# A login <form> still present in the response means authentication failed.
150 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
151 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
153 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
154 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
157 def _real_extract(self, url):
158 mobj = re.match(self._VALID_URL, url)
160 raise ExtractorError(u'Invalid URL: %s' % url)
161 video_id = mobj.group('ID')
163 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
164 webpage = self._download_webpage(url, video_id)
# The video parameters sit between these two literal JS fragments in the page.
166 BEFORE = '{swf.addParam(param[0], param[1]);});\n'
167 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
168 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
170 raise ExtractorError(u'Cannot parse data')
171 data = dict(json.loads(m.group(1)))
172 params_raw = compat_urllib_parse.unquote(data['params'])
173 params = json.loads(params_raw)
174 video_data = params['video_data'][0]
# Prefer the HD stream; fall back to SD (fallback guard elided in listing).
175 video_url = video_data.get('hd_src')
177 video_url = video_data['sd_src']
179 raise ExtractorError(u'Cannot find video URL')
180 video_duration = int(video_data['video_duration'])
181 thumbnail = video_data['thumbnail_src']
183 video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
# Result dict fields — surrounding `return` wrapper elided from this listing.
188 'title': video_title,
191 'duration': video_duration,
192 'thumbnail': thumbnail,
# EscapistIE — extracts videos from escapistmagazine.com by reading the
# og:video player URL from page <meta> tags, pulling its `config=` query
# parameter, and downloading/parsing that JS-style configuration as JSON.
# NOTE(review): listing is elided (missing `if mobj is None:` guard, `try:`,
# and the final `return` wrapper) — confirm against the full source.
202 class EscapistIE(InfoExtractor):
203 """Information extractor for The Escapist """
205 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
206 IE_NAME = u'escapist'
208 def _real_extract(self, url):
209 mobj = re.match(self._VALID_URL, url)
211 raise ExtractorError(u'Invalid URL: %s' % url)
212 showName = mobj.group('showname')
213 videoId = mobj.group('episode')
215 self.report_extraction(videoId)
216 webpage = self._download_webpage(url, videoId)
218 videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
219 webpage, u'description', fatal=False)
221 imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
222 webpage, u'thumbnail', fatal=False)
224 playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
225 webpage, u'player url')
# NOTE(review): the search label below says u'player url' but this extracts
# the title — looks like a copy/paste slip in the original; verify upstream.
227 title = self._html_search_regex('<meta name="title" content="([^"]*)"',
228 webpage, u'player url').split(' : ')[-1]
230 configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
231 configUrl = compat_urllib_parse.unquote(configUrl)
233 configJSON = self._download_webpage(configUrl, videoId,
234 u'Downloading configuration',
235 u'unable to download configuration')
237 # Technically, it's JavaScript, not JSON
238 configJSON = configJSON.replace("'", '"')
# (elided `try:` before the json.loads below)
241 config = json.loads(configJSON)
242 except (ValueError,) as err:
243 raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
245 playlist = config['playlist']
246 videoUrl = playlist[1]['url']
# Result dict fields — surrounding `return` wrapper elided from this listing.
251 'uploader': showName,
256 'description': videoDesc,
257 'player_url': playerUrl,
# CollegeHumorIE — extracts collegehumor.com videos via two XML fetches:
# first the moogaloop metadata XML (title/description/thumbnail/manifest URL),
# then the Adobe f4m manifest, from which it rebuilds the HDS segment URL.
# NOTE(review): listing is elided (missing `try:` lines and guards; `info` is
# used at line 295 without a visible initialization) — confirm upstream.
262 class CollegeHumorIE(InfoExtractor):
263 """Information extractor for collegehumor.com"""
266 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
267 IE_NAME = u'collegehumor'
269 def report_manifest(self, video_id):
270 """Report information extraction."""
271 self.to_screen(u'%s: Downloading XML manifest' % video_id)
273 def _real_extract(self, url):
274 mobj = re.match(self._VALID_URL, url)
276 raise ExtractorError(u'Invalid URL: %s' % url)
277 video_id = mobj.group('videoid')
285 self.report_extraction(video_id)
286 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
# (elided `try:` before the urlopen below)
288 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
289 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
290 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
292 mdoc = xml.etree.ElementTree.fromstring(metaXml)
# (elided `try:` — IndexError from the findall()[0] lookups is what line 300 reports)
294 videoNode = mdoc.findall('./video')[0]
295 info['description'] = videoNode.findall('./description')[0].text
296 info['title'] = videoNode.findall('./caption')[0].text
297 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
298 manifest_url = videoNode.findall('./file')[0].text
300 raise ExtractorError(u'Invalid metadata XML file')
# hdcore parameter is required for the Adobe HDS manifest endpoint to respond.
302 manifest_url += '?hdcore=2.10.3'
303 self.report_manifest(video_id)
305 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
306 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
307 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
309 adoc = xml.etree.ElementTree.fromstring(manifestXml)
311 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
312 node_id = media_node.attrib['url']
313 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
314 except IndexError as err:
315 raise ExtractorError(u'Invalid manifest file')
317 url_pr = compat_urllib_parse_urlparse(manifest_url)
# Rebuild the first HDS fragment URL from the manifest host, media id and node id.
318 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
# XVideosIE — extracts xvideos.com videos by scraping the flv_url query
# parameter, the <title> tag, and the thumbnail URL from the video page.
# NOTE(review): listing is elided (missing guard after re.match and the
# final `return` wrapper) — confirm against the full source.
325 class XVideosIE(InfoExtractor):
326 """Information extractor for xvideos.com"""
328 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
331 def _real_extract(self, url):
332 mobj = re.match(self._VALID_URL, url)
334 raise ExtractorError(u'Invalid URL: %s' % url)
335 video_id = mobj.group(1)
337 webpage = self._download_webpage(url, video_id)
339 self.report_extraction(video_id)
# The direct stream URL is URL-encoded inside a flv_url= page parameter.
342 video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
343 webpage, u'video URL'))
346 video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
349 # Extract video thumbnail
350 video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
351 webpage, u'thumbnail', fatal=False)
# Result dict fields — surrounding `return` wrapper elided from this listing.
358 'title': video_title,
360 'thumbnail': video_thumbnail,
# SoundcloudIE — extracts a single SoundCloud track: resolves the page URL to
# a track id via the /resolve.json API, then fetches the stream definitions
# and picks the 128kbps MP3 HTTP stream.
# NOTE(review): listing is elided (docstring close, `return` wrapper) and the
# client_id is hard-coded — confirm against the full source.
367 class SoundcloudIE(InfoExtractor):
368 """Information extractor for soundcloud.com
369 To access the media, the uid of the song and a stream token
370 must be extracted from the page source and the script must make
371 a request to media.soundcloud.com/crossdomain.xml. Then
372 the media can be grabbed by requesting from an url composed
373 of the stream token and uid
376 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
377 IE_NAME = u'soundcloud'
379 def report_resolve(self, video_id):
380 """Report information extraction."""
381 self.to_screen(u'%s: Resolving id' % video_id)
383 def _real_extract(self, url):
384 mobj = re.match(self._VALID_URL, url)
386 raise ExtractorError(u'Invalid URL: %s' % url)
388 # extract uploader (which is in the url)
389 uploader = mobj.group(1)
390 # extract simple title (uploader + slug of song title)
391 slug_title = mobj.group(2)
392 simple_title = uploader + u'-' + slug_title
393 full_title = '%s/%s' % (uploader, slug_title)
395 self.report_resolve(full_title)
397 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
398 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
399 info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
401 info = json.loads(info_json)
402 video_id = info['id']
403 self.report_extraction(full_title)
405 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
406 stream_json = self._download_webpage(streams_url, full_title,
407 u'Downloading stream definitions',
408 u'unable to download stream definitions')
410 streams = json.loads(stream_json)
411 mediaURL = streams['http_mp3_128_url']
412 upload_date = unified_strdate(info['created_at'])
# Result dict fields — surrounding `return` wrapper elided from this listing.
417 'uploader': info['user']['username'],
418 'upload_date': upload_date,
419 'title': info['title'],
421 'description': info['description'],
# SoundcloudSetIE — extracts SoundCloud "sets" (playlists): resolves the set
# URL via /resolve.json, reports any API errors, then fetches the stream
# definitions for each track in the set (same flow as SoundcloudIE per track).
# NOTE(review): listing is elided (docstring close, error-branch guard, result
# accumulation and `return`) — confirm against the full source.
424 class SoundcloudSetIE(InfoExtractor):
425 """Information extractor for soundcloud.com sets
426 To access the media, the uid of the song and a stream token
427 must be extracted from the page source and the script must make
428 a request to media.soundcloud.com/crossdomain.xml. Then
429 the media can be grabbed by requesting from an url composed
430 of the stream token and uid
433 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
434 IE_NAME = u'soundcloud:set'
436 def report_resolve(self, video_id):
437 """Report information extraction."""
438 self.to_screen(u'%s: Resolving id' % video_id)
440 def _real_extract(self, url):
441 mobj = re.match(self._VALID_URL, url)
443 raise ExtractorError(u'Invalid URL: %s' % url)
445 # extract uploader (which is in the url)
446 uploader = mobj.group(1)
447 # extract simple title (uploader + slug of song title)
448 slug_title = mobj.group(2)
449 simple_title = uploader + u'-' + slug_title
450 full_title = '%s/sets/%s' % (uploader, slug_title)
452 self.report_resolve(full_title)
454 url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
455 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
456 info_json = self._download_webpage(resolv_url, full_title)
459 info = json.loads(info_json)
# (elided guard — this error loop presumably runs only when 'errors' is present)
461 for err in info['errors']:
462 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
465 self.report_extraction(full_title)
466 for track in info['tracks']:
467 video_id = track['id']
469 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
470 stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')
472 self.report_extraction(video_id)
473 streams = json.loads(stream_json)
474 mediaURL = streams['http_mp3_128_url']
# Per-track result dict fields — list accumulation and `return` elided.
479 'uploader': track['user']['username'],
480 'upload_date': unified_strdate(track['created_at']),
481 'title': track['title'],
483 'description': track['description'],
# InfoQIE — extracts infoq.com presentations: the real media id is base64
# encoded in a `jsclassref` JS variable on the page; the stream is served
# over RTMPE from video.infoq.com.
# NOTE(review): listing is elided (guards and `return` wrapper) — confirm
# against the full source.
488 class InfoQIE(InfoExtractor):
489 """Information extractor for infoq.com"""
490 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
492 def _real_extract(self, url):
493 mobj = re.match(self._VALID_URL, url)
495 raise ExtractorError(u'Invalid URL: %s' % url)
497 webpage = self._download_webpage(url, video_id=url)
498 self.report_extraction(url)
# Extract the base64-encoded video id from the page's jsclassref variable.
501 mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
503 raise ExtractorError(u'Unable to extract video url')
504 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
505 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
508 video_title = self._search_regex(r'contentTitle = "(.*?)";',
511 # Extract description
512 video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
513 webpage, u'description', fatal=False)
515 video_filename = video_url.split('/')[-1]
516 video_id, extension = video_filename.split('.')
# Result dict fields — surrounding `return` wrapper elided from this listing.
523 'title': video_title,
524 'ext': extension, # Extension is always(?) mp4, but seems to be flv
526 'description': video_description,
# MixcloudIE — extracts mixcloud.com cloudcasts via the /api/1/cloudcast JSON
# endpoint; get_urls/check_urls pick a working URL for the requested format
# and bitrate. Marked _WORKING = False (superseded by a newer API).
# NOTE(review): listing is elided throughout (missing `try:` lines, `return`
# statements in get_urls/check_urls, format-selection branches) — confirm
# against the full source. Note the .decode('utf-8') calls imply Python 2
# byte strings; this block predates the py3 unification.
531 class MixcloudIE(InfoExtractor):
532 """Information extractor for www.mixcloud.com"""
534 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
535 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
536 IE_NAME = u'mixcloud'
538 def report_download_json(self, file_id):
539 """Report JSON download."""
540 self.to_screen(u'Downloading json')
542 def get_urls(self, jsonData, fmt, bitrate='best'):
543 """Get urls from 'audio_formats' section in json"""
# (elided `try:` — the TypeError branch below is its handler)
546 bitrate_list = jsonData[fmt]
547 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
548 bitrate = max(bitrate_list) # select highest
550 url_list = jsonData[fmt][bitrate]
551 except TypeError: # we have no bitrate info.
552 url_list = jsonData[fmt]
# (elided `return url_list`)
555 def check_urls(self, url_list):
556 """Returns 1st active url from list"""
# (elided loop header and success `return` around the urlopen probe)
559 compat_urllib_request.urlopen(url)
561 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
566 def _print_formats(self, formats):
567 print('Available formats:')
568 for fmt in formats.keys():
569 for b in formats[fmt]:
# (elided `try:` — the TypeError branch below is its handler)
571 ext = formats[fmt][b][0]
572 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
573 except TypeError: # we have no bitrate info
574 ext = formats[fmt][0]
575 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
578 def _real_extract(self, url):
579 mobj = re.match(self._VALID_URL, url)
581 raise ExtractorError(u'Invalid URL: %s' % url)
582 # extract uploader & filename from url
583 uploader = mobj.group(1).decode('utf-8')
584 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
586 # construct API request
587 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
588 # retrieve .json file with links to files
589 request = compat_urllib_request.Request(file_url)
# (elided `try:` before the urlopen below)
591 self.report_download_json(file_url)
592 jsonData = compat_urllib_request.urlopen(request).read()
593 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
594 raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
597 json_data = json.loads(jsonData)
598 player_url = json_data['player_swf_url']
599 formats = dict(json_data['audio_formats'])
601 req_format = self._downloader.params.get('format', None)
604 if self._downloader.params.get('listformats', None):
605 self._print_formats(formats)
# Format selection: try each available format until one yields a live URL,
# otherwise honour the explicitly requested format (branch lines elided).
608 if req_format is None or req_format == 'best':
609 for format_param in formats.keys():
610 url_list = self.get_urls(formats, format_param)
612 file_url = self.check_urls(url_list)
613 if file_url is not None:
616 if req_format not in formats:
617 raise ExtractorError(u'Format is not available')
619 url_list = self.get_urls(formats, req_format)
620 file_url = self.check_urls(url_list)
621 format_param = req_format
# Result dict fields — surrounding `return` wrapper elided from this listing.
624 'id': file_id.decode('utf-8'),
625 'url': file_url.decode('utf-8'),
626 'uploader': uploader.decode('utf-8'),
628 'title': json_data['name'],
629 'ext': file_url.split('.')[-1].decode('utf-8'),
630 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
631 'thumbnail': json_data['thumbnail_url'],
632 'description': json_data['description'],
633 'player_url': player_url.decode('utf-8'),
# StanfordOpenClassroomIE — handles three URL shapes: a specific video
# (course+video -> per-video XML metadata), a course page (scrape VideoPage
# links and recurse via self.extract), and the site root (scrape CoursePage
# links and recurse). Recursive entries are returned as 'reference' items.
# NOTE(review): listing is elided (missing `try:` lines, dict initializations
# for `info`, list-building loops, `return results`) — confirm upstream.
636 class StanfordOpenClassroomIE(InfoExtractor):
637 """Information extractor for Stanford's Open ClassRoom"""
639 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
640 IE_NAME = u'stanfordoc'
642 def _real_extract(self, url):
643 mobj = re.match(self._VALID_URL, url)
645 raise ExtractorError(u'Invalid URL: %s' % url)
647 if mobj.group('course') and mobj.group('video'): # A specific video
648 course = mobj.group('course')
649 video = mobj.group('video')
# (elided `info = {` dict opener around the field below)
651 'id': course + '_' + video,
656 self.report_extraction(info['id'])
657 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
658 xmlUrl = baseUrl + video + '.xml'
# (elided `try:` before the urlopen below)
660 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
661 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
662 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
663 mdoc = xml.etree.ElementTree.fromstring(metaXml)
# (elided `try:` — findall()[0] raising IndexError is what line 668 reports)
665 info['title'] = mdoc.findall('./title')[0].text
666 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
668 raise ExtractorError(u'Invalid metadata XML file')
669 info['ext'] = info['url'].rpartition('.')[2]
671 elif mobj.group('course'): # A course page
672 course = mobj.group('course')
680 coursepage = self._download_webpage(url, info['id'],
681 note='Downloading course info page',
682 errnote='Unable to download course info page')
684 info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
686 info['description'] = self._html_search_regex('<description>([^<]+)</description>',
687 coursepage, u'description', fatal=False)
689 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
# (elided list-comprehension wrapper building info['list'] entries)
693 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
697 for entry in info['list']:
698 assert entry['type'] == 'reference'
699 results += self.extract(entry['url'])
# else: the whole site root page (elided `info = {` opener)
703 'id': 'Stanford OpenClassroom',
709 self.report_download_webpage(info['id'])
710 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
# (elided `try:` before the urlopen below)
712 rootpage = compat_urllib_request.urlopen(rootURL).read()
713 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
714 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
716 info['title'] = info['id']
718 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
722 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
727 for entry in info['list']:
728 assert entry['type'] == 'reference'
729 results += self.extract(entry['url'])
# MTVIE — extracts mtv.com videos: reads song/artist/uri metadata from page
# <meta> tags, queries the mediaGen XML service, and picks the last (highest
# quality) <rendition> entry.
# NOTE(review): listing is elided (missing guards, `try:` lines, `performer`
# assignment, and the `return` wrapper) — confirm against the full source.
732 class MTVIE(InfoExtractor):
733 """Information extractor for MTV.com"""
735 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
738 def _real_extract(self, url):
739 mobj = re.match(self._VALID_URL, url)
741 raise ExtractorError(u'Invalid URL: %s' % url)
# Normalize scheme-less URLs before downloading.
742 if not mobj.group('proto'):
743 url = 'http://' + url
744 video_id = mobj.group('videoid')
746 webpage = self._download_webpage(url, video_id)
748 song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
749 webpage, u'song name', fatal=False)
751 video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
754 mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
755 webpage, u'mtvn_uri', fatal=False)
757 content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
758 webpage, u'content id', fatal=False)
760 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
761 self.report_extraction(video_id)
762 request = compat_urllib_request.Request(videogen_url)
# (elided `try:` before the urlopen below)
764 metadataXml = compat_urllib_request.urlopen(request).read()
765 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
766 raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
768 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
769 renditions = mdoc.findall('.//rendition')
771 # For now, always pick the highest quality.
772 rendition = renditions[-1]
# (elided `try:` — the rendition attribute lookups below are what line 779 reports)
775 _,_,ext = rendition.attrib['type'].partition('/')
776 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
777 video_url = rendition.find('./src').text
779 raise ExtractorError('Invalid rendition field.')
# Result dict fields — `performer` is assigned in elided lines; `return` elided.
784 'uploader': performer,
786 'title': video_title,
# YoukuIE — extracts v.youku.com videos. Youku obfuscates the real file id:
# _get_file_ID_mix_string builds a seeded pseudo-random permutation of a
# character alphabet, and _get_file_id maps the '*'-separated index list from
# the API back through that permutation. Each video is served in numbered
# segments whose ids patch bytes 8-9 of the file id.
# NOTE(review): listing is elided (method `def _gen_sid` header before line
# 798, `mixed = []` init, loops, format-selection branches, `return`) —
# confirm against the full source.
794 class YoukuIE(InfoExtractor):
795 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
# (elided `def _gen_sid(self):` header — the lines below build a session id
# from the current time plus two random components)
798 nowTime = int(time.time() * 1000)
799 random1 = random.randint(1000,1998)
800 random2 = random.randint(1000,9999)
802 return "%d%d%d" %(nowTime,random1,random2)
804 def _get_file_ID_mix_string(self, seed):
# (elided `mixed = []` initialization)
806 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
808 for i in range(len(source)):
# Linear congruential step; picks and removes one alphabet character per round.
809 seed = (seed * 211 + 30031 ) % 65536
810 index = math.floor(seed / 65536 * len(source) )
811 mixed.append(source[int(index)])
812 source.remove(source[int(index)])
813 #return ''.join(mixed)
816 def _get_file_id(self, fileId, seed):
817 mixed = self._get_file_ID_mix_string(seed)
818 ids = fileId.split('*')
# (elided `realId = []` init and loop header over the index list)
822 realId.append(mixed[int(ch)])
823 return ''.join(realId)
825 def _real_extract(self, url):
826 mobj = re.match(self._VALID_URL, url)
828 raise ExtractorError(u'Invalid URL: %s' % url)
829 video_id = mobj.group('ID')
831 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
833 jsondata = self._download_webpage(info_url, video_id)
835 self.report_extraction(video_id)
# (elided `try:` — the except at line 861 covers this parsing section)
837 config = json.loads(jsondata)
839 video_title = config['data'][0]['title']
840 seed = config['data'][0]['seed']
842 format = self._downloader.params.get('format', None)
843 supported_format = list(config['data'][0]['streamfileids'].keys())
# Format selection branches (hd2/best/worst mapping lines elided).
845 if format is None or format == 'best':
846 if 'hd2' in supported_format:
851 elif format == 'worst':
859 fileid = config['data'][0]['streamfileids'][format]
860 keys = [s['k'] for s in config['data'][0]['segs'][format]]
861 except (UnicodeDecodeError, ValueError, KeyError):
862 raise ExtractorError(u'Unable to extract info section')
865 sid = self._gen_sid()
866 fileid = self._get_file_id(fileid, seed)
868 #column 8,9 of fileid represent the segment number
869 #fileid[7:9] should be changed
870 for index, key in enumerate(keys):
872 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
873 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
# Per-segment info dict (opener and some fields elided from this listing).
876 'id': '%s_part%02d' % (video_id, index),
880 'title': video_title,
883 files_info.append(info)
# XNXXIE — extracts video.xnxx.com videos using three class-level regexes for
# the flv URL, page title, and big thumbnail URL.
# NOTE(review): listing is elided (guard after re.match and the `return`
# wrapper) — confirm against the full source.
888 class XNXXIE(InfoExtractor):
889 """Information extractor for xnxx.com"""
891 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
893 VIDEO_URL_RE = r'flv_url=(.*?)&'
894 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
895 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
897 def _real_extract(self, url):
898 mobj = re.match(self._VALID_URL, url)
900 raise ExtractorError(u'Invalid URL: %s' % url)
901 video_id = mobj.group(1)
903 # Get webpage content
904 webpage = self._download_webpage(url, video_id)
906 video_url = self._search_regex(self.VIDEO_URL_RE,
907 webpage, u'video URL')
# The flv_url parameter is URL-encoded in the page.
908 video_url = compat_urllib_parse.unquote(video_url)
910 video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
913 video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
914 webpage, u'thumbnail', fatal=False)
# Result dict fields — surrounding `return` wrapper elided from this listing.
921 'title': video_title,
923 'thumbnail': video_thumbnail,
# GooglePlusIE — extracts videos embedded in plus.google.com posts: scrapes
# the post page for metadata, follows the photos page the image box links to,
# then collects all (resolution, url) pairs and takes the highest resolution.
# NOTE(review): listing is elided (guards, the sort of `mobj` into `links`,
# and the `return` wrapper) — confirm against the full source.
928 class GooglePlusIE(InfoExtractor):
929 """Information extractor for plus.google.com."""
931 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
932 IE_NAME = u'plus.google'
934 def _real_extract(self, url):
935 # Extract id from URL
936 mobj = re.match(self._VALID_URL, url)
938 raise ExtractorError(u'Invalid URL: %s' % url)
940 post_url = mobj.group(0)
941 video_id = mobj.group(1)
943 video_extension = 'flv'
945 # Step 1, Retrieve post webpage to extract further information
946 webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
948 self.report_extraction(video_id)
950 # Extract update date
951 upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
952 webpage, u'upload date', fatal=False)
# (elided guard — the strptime below presumably runs only when a date matched)
954 # Convert timestring to a format suitable for filename
955 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
956 upload_date = upload_date.strftime('%Y%m%d')
959 uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
960 webpage, u'uploader', fatal=False)
963 # Get the first line for title
964 video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
965 webpage, 'title', default=u'NA')
967 # Step 2, Stimulate clicking the image box to launch video
968 video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
969 webpage, u'video page URL')
970 webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
972 # Extract video links on video page
973 """Extract video links of all sizes"""
974 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
975 mobj = re.findall(pattern, webpage)
977 raise ExtractorError(u'Unable to extract video links')
# (elided: sorting of the findall results into `links` by resolution)
982 # Choose the lowest of the sort, i.e. highest resolution
983 video_url = links[-1]
984 # Only get the url. The resolution part in the tuple has no use anymore
985 video_url = video_url[-1]
986 # Treat escaped \u0026 style hex
# (elided `try:` — Python 2 path; the except below handles Python 3 strings)
988 video_url = video_url.decode("unicode_escape")
989 except AttributeError: # Python 3
990 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
# Result dict fields — surrounding `return` wrapper elided from this listing.
996 'uploader': uploader,
997 'upload_date': upload_date,
998 'title': video_title,
999 'ext': video_extension,
# NBAIE — extracts nba.com videos by constructing the CDN MP4 URL directly
# from the URL path and scraping title/description from page <meta> tags.
# NOTE(review): listing is elided (guard after re.match and the `return`
# wrapper) — confirm against the full source.
1002 class NBAIE(InfoExtractor):
1003 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
1006 def _real_extract(self, url):
1007 mobj = re.match(self._VALID_URL, url)
1009 raise ExtractorError(u'Invalid URL: %s' % url)
1011 video_id = mobj.group(1)
1013 webpage = self._download_webpage(url, video_id)
# The stream URL is deterministic: the page path maps straight onto the CDN.
1015 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
1017 shortened_video_id = video_id.rpartition('/')[2]
1018 title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
1019 webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
1021 # It isn't there in the HTML it returns to us
1022 # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
1024 description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
# Result dict fields — surrounding `return` wrapper elided from this listing.
1027 'id': shortened_video_id,
1031 # 'uploader_date': uploader_date,
1032 'description': description,
1036 class JustinTVIE(InfoExtractor):
1037 """Information extractor for justin.tv and twitch.tv"""
1038 # TODO: One broadcast may be split into multiple videos. The key
1039 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
1040 # starts at 1 and increases. Can we treat all parts as one video?
1042 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
1044 (?P<channelid>[^/]+)|
1045 (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
1046 (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
1050 _JUSTIN_PAGE_LIMIT = 100
1051 IE_NAME = u'justin.tv'
1053 def report_download_page(self, channel, offset):
1054 """Report attempt to download a single page of videos."""
1055 self.to_screen(u'%s: Downloading video information from %d to %d' %
1056 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
1058 # Return count of items, list of *valid* items
1059 def _parse_page(self, url, video_id):
1060 webpage = self._download_webpage(url, video_id,
1061 u'Downloading video info JSON',
1062 u'unable to download video info JSON')
1064 response = json.loads(webpage)
1065 if type(response) != list:
1066 error_text = response.get('error', 'unknown error')
1067 raise ExtractorError(u'Justin.tv API: %s' % error_text)
1069 for clip in response:
1070 video_url = clip['video_file_url']
1072 video_extension = os.path.splitext(video_url)[1][1:]
1073 video_date = re.sub('-', '', clip['start_time'][:10])
1074 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
1075 video_id = clip['id']
1076 video_title = clip.get('title', video_id)
1080 'title': video_title,
1081 'uploader': clip.get('channel_name', video_uploader_id),
1082 'uploader_id': video_uploader_id,
1083 'upload_date': video_date,
1084 'ext': video_extension,
1086 return (len(response), info)
    def _real_extract(self, url):
        """Dispatch on URL shape: channel archive, chapter ('/c/'), or single video ('/b/')."""
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'

        if mobj.group('channelid'):
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            # The chapter page embeds the id of its parent broadcast archive.
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                                      note=u'Downloading chapter information',
                                                      errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            # Locate the <archive> element matching the parent broadcast id.
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            # Richer metadata (title, thumbnail, channel) comes from the Twitch API.
            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                                       note='Downloading chapter metadata',
                                                       errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

                'id': u'c' + chapter_id,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        # Page through the API until a short (last) page is returned.
        limit = self._JUSTIN_PAGE_LIMIT
            self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            if not paged or page_count != limit:
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        # The stream URL is on the second <source> of the <video> element.
        video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            webpage, u'video URL', flags=re.DOTALL)

        # Prefer the player headline, then fall back to the page <title>.
        title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
            r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False, flags=re.DOTALL)

            'description': video_description,
class SteamIE(InfoExtractor):
    """Information extractor for Steam store video pages."""
    # Verbose regex — always matched with re.VERBOSE below.
    _VALID_URL = r"""http://store\.steampowered\.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
    _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
    # Pre-filled birth date to get past the Steam age gate.
    _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')

        videourl = self._VIDEO_PAGE_TEMPLATE % gameID
        webpage = self._download_webpage(videourl, gameID)

        # Retry through the age-check URL if the gate page was served.
        if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
            videourl = self._AGECHECK_TEMPLATE % gameID
            self.report_age_confirmation()
            webpage = self._download_webpage(videourl, gameID)

        self.report_extraction(gameID)
        game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
            webpage, 'game title')

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        # Movie entries, their titles and thumbnails appear in page order,
        # so the three iterators are zipped together.
        for vid,vtitle,thumb in zip(mweb,titles,thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
                'title': unescapeHTML(title),
                'thumbnail': video_thumb
        return [self.playlist_result(videos, gameID, game_title)]
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        # Recorded videos are served straight from the CDN by numeric id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',

        uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
            webpage, u'thumbnail', fatal=False)

            'title': video_title,
            'uploader': uploader,
            'thumbnail': thumbnail,
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""
    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        webpage_src = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
            webpage_src, u'video URL')

        # File extension is inferred from the media URL (mp4 vs flv).
        if 'mp4' in video_url:

        video_title = self._html_search_regex(r"<title>(.*)</title>",
            webpage_src, u'title')

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
            webpage_src, u'thumbnail', fatal=False)
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                video_title = mobj.group(1)

            'title' : video_title,
            'thumbnail' : thumbnail,
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        # Show metadata is assigned to window.gon in an inline script.
        json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
            webpage, u'json data', flags=re.MULTILINE)

            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the 256 kbit/s stream from the Akamai URL.
        video_url = data['akamai_url'] + '&cbr=256'
        url_parts = compat_urllib_parse_urlparse(video_url)
        video_ext = url_parts.path.rpartition('.')[2]
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': data.get('host', {}).get('name'),
            'uploader_id': data.get('host', {}).get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        # Return the single format entry matching the requested format string.
            if(x["format"]==req_format):

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # The age_verified cookie bypasses the age gate.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
            params = json.loads(json_params)
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
            # NOTE(review): sys.exc_info()[1] is an exception object; '+' with a
            # str raises TypeError on Python 3 — should be wrapped in compat_str().
            raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # Format label is the "<resolution>_<bitrate>" part of the path.
            format = path.split('/')[4].split('_')[:2]
            format = "-".join( format )
            # title = u'%s-%s-%s' % (video_title, size, bitrate)

                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'thumbnail': thumbnail,
                'description': video_description

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            format = self._specific( req_format, formats )
                raise ExtractorError(u'Requested format not available')
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # The flv URL is URL-quoted inside the player setup script.
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
        video_url = compat_urllib_parse.unquote(video_url)

        #Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
        if upload_date: upload_date = unified_strdate(upload_date)

        info = {'id': video_id,
                'upload_date': upload_date,
                'title': video_title,
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video title
        video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
            webpage, u'title').strip()

        # Get the embed page
        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = result.group(0).strip()
        video_id = result.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The media URL is passed URL-encoded to the flash player.
        video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            webpage, u'video URL')

        info = {'id': video_id,
                'title': video_title,
                'player_url': embed_page_url}
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes (one entry per track)."""
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(json_like)

        # The play API requires a (random) client session id.
        session = str(random.randint(0, 1000000000))
        track_count = data['tracks_count']
        # NOTE(review): mix_id is used below but its assignment is not visible
        # here — presumably mix_id = data['id']; verify.
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url
        # Walk the play/next API until the last track is flagged.
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
            if api_data['set']['at_last_track']:
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
class KeekIE(InfoExtractor):
    """Information extractor for keek.com clips."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        # Media and thumbnail URLs are derived directly from the video id.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',

        uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)

            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    # Verbose regex — always matched with re.VERBOSE below.
    _VALID_URL=r'''http://www\.ted\.com/
        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
        ((?P<type_talk>talks)) # We have a simple talk
        (/lang/(.*?))? # The url may contain the language
        /(?P<name>\w+) # Here goes the name and then ".html"

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch to single-talk or playlist handling by matched group.
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        playlist_id=m.group('playlist_id')
        name=m.group('name')
        self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
        return [self._playlist_videos_info(url,name,playlist_id)]

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
            <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
            ([.\s]*?)data-playlist_item_id="(\d+)"
            ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
            webpage, 'playlist title')

        # Each playlist entry is delegated back to this IE as a talk URL.
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m = re.match(self._VALID_URL, url,re.VERBOSE)
        video_name = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
        self.report_extraction(video_name)
        # If the url includes the language we get the title translated
        title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
        json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
            webpage, 'json data')
        info = json.loads(json_data)
        desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
            webpage, 'description', flags = re.DOTALL)

        thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
            webpage, 'thumbnail')
            # The last htmlStreams entry is used as the media URL.
            'url': info['htmlStreams'][-1]['file'],
            'thumbnail': thumbnail,
            'description': desc,
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de (metadata via a per-video XML API)."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            format = format_id_el.text
        # description and thumbnail are optional in the metadata document
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
            'thumbnail': thumbnail,
            'description': description
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',

        # A per-video XML document lists the available encodings.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        last_type = idoc[-1]  # last entry — presumably the best quality; TODO confirm
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        video_ext = filename.rpartition('.')[2]
            'title': video_title,
            'duration': duration,
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com view pages."""
    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # Strip the site-name prefix off the og:title value.
        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

            'title': video_title,
            'description': video_description,
            'uploader': video_uploader
class TumblrIE(InfoExtractor):
    """Information extractor for tumblr.com video posts."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Normalise to the canonical post URL before downloading.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The video URL is hex-escaped (\x22 quotes) inside inline javascript.
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
            raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)  # We pick the first poster
        if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{'id': video_id,
                 'title': video_title,
                 'thumbnail': video_thumbnail,
class BandcampIE(InfoExtractor):
    """Information extractor for free bandcamp.com tracks."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        # NOTE(review): 'id' shadows the builtin; consider renaming.
        id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                       webpage, re.MULTILINE|re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascrip code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        #We build the url we will use to get the final track url
        # This url is build in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        #in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id':id,
                      'title' : info[u'title'],
                      'thumbnail' : info[u'thumb_url'],
                      'uploader' : info[u'artist']
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_extension = 'mp4'
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # The page serves an HTML5 <source> tag with the mp4 URL.
        video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
            webpage, u'video URL')

        video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            'ext': video_extension,
            'title': video_title,
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        # The player's MRSS feed exposes the actual mp4 URL and title.
        mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
        video_extension = 'mp4'
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            webpage, u'video URL')

        video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            'ext': video_extension,
            'title': video_title,
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        # Canonicalise to the plain video URL before fetching.
        webpage_url = 'http://www.howcast.com/videos/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')

        video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',

        video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        webpage_url = 'https://vine.co/v/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        # The raw stream URL is published in a twitter player meta tag.
        video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')

        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',

        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)

        uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader,
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')
        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # The per-photo secret is required by both playlist requests below.
        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        # The final media URL is APP + FULLPATH from the <STREAM> element.
        mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
            raise ExtractorError(u'Unable to extract video url')
        video_url = mobj.group(1) + unescapeHTML(mobj.group(2))

        video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')

        video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
            'uploader_id': video_uploader_id,
class TeamcocoIE(InfoExtractor):
    """Information extractor for teamcoco.com videos."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        # The numeric video id is embedded in the article markup.
        video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
            webpage, u'video id')

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',

        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)

        video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        # A separate XML document lists the media files; use the "high" one.
        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        data = self._download_webpage(data_url, video_id, 'Downloading data webpage')

        video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
            'title': video_title,
            'thumbnail': thumbnail,
            'description': video_description,
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
            raise ExtractorError(u'Unable to extract media URL')
        if len(mobj.group('server')) == 0:
            # Without a server prefix the 'file' value is a complete URL.
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
            video_url = mobj.group('server')+'/key='+mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',

        # Can't see the description anywhere in the UI
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        #     webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        # Upload date is only available as a tooltip hint; fall back to None.
        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
            video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)
            'ext': video_extension,
            'title': video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the 'if mobj is None:' guard for this raise appears
        # to be elided in this view.
        raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        # 'ax'/'ts' query parameters make the page embed the track list as
        # JSON; 'ts' is the current Unix timestamp.
        data = { 'ax': 1, 'ts': time.time() }
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        # Keep the response handle: the session cookie it sets is required
        # by the serve/source endpoint below.
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        # Track metadata lives in an inline <script type="application/json">.
        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        # NOTE(review): the enclosing 'try:' for this json.loads appears to
        # be elided; the raise below is its except branch.
        track_list = json.loads(html_tracks)
        track = track_list[u'tracks'][0]
        raise ExtractorError(u'Hypemachine contained invalid JSON.')

        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        # NOTE(review): 'key' is read from the track dict on a line elided
        # from this view.
        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        # Empty POST body with a JSON content type; the session cookie from
        # the first request authorizes the call.
        request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        # NOTE(review): enclosing 'try:' appears elided here as well.
        song_data = json.loads(song_data_json)
        raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the 'if mobj is None:' guard for this raise appears
        # to be elided in this view.
        raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # The play: URL serves a JS redirect; follow it manually by reading
        # the window.location assignment out of the page.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        # new_location is relative to the URL we actually landed on.
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        # Page title has the form "<video title>/..."; keep the first part.
        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        # POST to the flash info endpoint to obtain the media and thumbnail
        # URLs, returned as a '&'-joined 'key=value' string.
        info_url = "http://vbox7.com/play/magare.do"
        data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        # NOTE(review): tuple-unpacking a map() only works on Python 2;
        # under Python 3 map() returns an iterator — confirm target runtime.
        (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))

        'thumbnail': thumbnail_url,
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    YoutubePlaylistIE(),
    StanfordOpenClassroomIE(),  # NOTE(review): bulk of the extractor list is elided from this view
    WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Return the info-extractor class named ``ie_name`` + 'IE'.

    Looks the class up in this module's global namespace; a KeyError
    propagates to the caller when no such extractor exists.
    """
    class_name = ie_name + 'IE'
    return globals()[class_name]