10 import xml.etree.ElementTree
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.comedycentral import ComedyCentralIE
25 from .extractor.dailymotion import DailymotionIE
26 from .extractor.gametrailers import GametrailersIE
27 from .extractor.generic import GenericIE
28 from .extractor.google import GoogleSearchIE
29 from .extractor.metacafe import MetacafeIE
30 from .extractor.myvideo import MyVideoIE
31 from .extractor.statigram import StatigramIE
32 from .extractor.photobucket import PhotobucketIE
33 from .extractor.vimeo import VimeoIE
34 from .extractor.yahoo import YahooIE, YahooSearchIE
35 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
36 from .extractor.zdf import ZDFIE
# Extractor for depositfiles.com file-hosting pages: POSTs the "Free download"
# form, then scrapes the real file URL and title out of the returned HTML.
# NOTE(review): this listing keeps the original source line number as a prefix
# on each line; gaps in those numbers mean statements (try:, return, ...) were
# elided from this view — do not assume the visible lines are contiguous code.
56 class DepositFilesIE(InfoExtractor):
57     """Information extractor for depositfiles.com"""
59 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
61 def _real_extract(self, url):
62 file_id = url.split('/')[-1]
63 # Rebuild url in english locale
64 url = 'http://depositfiles.com/en/files/' + file_id
66 # Retrieve file webpage with 'Free download' button pressed
67 free_download_indication = { 'gateway_result' : '1' }
68 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
# (original line 69, presumably a "try:", is elided here — TODO confirm)
70 self.report_download_webpage(file_id)
71 webpage = compat_urllib_request.urlopen(request).read()
72 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
73 raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))
75 # Search for the real file URL
76 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
77 if (mobj is None) or (mobj.group(1) is None):
78 # Try to figure out reason of the error.
79 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
80 if (mobj is not None) and (mobj.group(1) is not None):
81 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
82 raise ExtractorError(u'%s' % restriction_message)
# Fallback error when no human-readable restriction message was found.
84 raise ExtractorError(u'Unable to extract download URL from: %s' % url)
86 file_url = mobj.group(1)
87 file_extension = os.path.splitext(file_url)[1][1:]
89 # Search for file title
90 file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')
# (return-dict opening elided; remaining lines are entries of the info dict)
93 'id': file_id.decode('utf-8'),
94 'url': file_url.decode('utf-8'),
98 'ext': file_extension.decode('utf-8'),
# Extractor for facebook.com video/photo pages. Optionally logs in (via
# --username/--password or .netrc under machine "facebook") before extraction,
# then parses a JSON blob embedded between two known JS fragments in the page.
# NOTE(review): gaps in the embedded line numbers mean several statements
# (try:, else:, return, login_form construction) are elided from this listing.
102 class FacebookIE(InfoExtractor):
103     """Information Extractor for Facebook"""
105 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
106 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
107 _NETRC_MACHINE = 'facebook'
108 IE_NAME = u'facebook'
110 def report_login(self):
111     """Report attempt to log in."""
112 self.to_screen(u'Logging in')
# Runs once before extraction: resolves credentials and performs the login POST.
114 def _real_initialize(self):
115 if self._downloader is None:
120 downloader_params = self._downloader.params
122 # Attempt to use provided username and password or .netrc data
123 if downloader_params.get('username', None) is not None:
124 useremail = downloader_params['username']
125 password = downloader_params['password']
126 elif downloader_params.get('usenetrc', False):
128 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
133 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
134 except (IOError, netrc.NetrcParseError) as err:
135 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
# No credentials available at all: skip the login step entirely.
138 if useremail is None:
147 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
150 login_results = compat_urllib_request.urlopen(request).read()
# The login form being echoed back indicates the login did not succeed.
151 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
152 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
154 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
155 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
158 def _real_extract(self, url):
159 mobj = re.match(self._VALID_URL, url)
161 raise ExtractorError(u'Invalid URL: %s' % url)
162 video_id = mobj.group('ID')
164 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
165 webpage = self._download_webpage(url, video_id)
# The video parameters live in a JSON array bracketed by these two exact
# JavaScript fragments in the page source.
167 BEFORE = '{swf.addParam(param[0], param[1]);});\n'
168 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
169 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
171 raise ExtractorError(u'Cannot parse data')
172 data = dict(json.loads(m.group(1)))
173 params_raw = compat_urllib_parse.unquote(data['params'])
174 params = json.loads(params_raw)
175 video_data = params['video_data'][0]
# Prefer the HD stream; fall back to SD if HD is absent.
176 video_url = video_data.get('hd_src')
178 video_url = video_data['sd_src']
180 raise ExtractorError(u'Cannot find video URL')
181 video_duration = int(video_data['video_duration'])
182 thumbnail = video_data['thumbnail_src']
184 video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
# (return-dict opening elided; remaining lines are entries of the info dict)
189 'title': video_title,
192 'duration': video_duration,
193 'thumbnail': thumbnail,
# Extractor for escapistmagazine.com videos: scrapes meta tags for description,
# thumbnail, player URL and title, then downloads the player's JS "config"
# (almost-JSON) to find the actual playlist entry with the media URL.
# NOTE(review): gaps in the embedded line numbers mean statements (if mobj is
# None:, try:, return dict) are elided from this listing.
203 class EscapistIE(InfoExtractor):
204     """Information extractor for The Escapist """
206 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
207 IE_NAME = u'escapist'
209 def _real_extract(self, url):
210 mobj = re.match(self._VALID_URL, url)
212 raise ExtractorError(u'Invalid URL: %s' % url)
213 showName = mobj.group('showname')
214 videoId = mobj.group('episode')
216 self.report_extraction(videoId)
217 webpage = self._download_webpage(url, videoId)
219 videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
220 webpage, u'description', fatal=False)
222 imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
223 webpage, u'thumbnail', fatal=False)
225 playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
226 webpage, u'player url')
# NOTE(review): the error-label string u'player url' here looks copy-pasted
# from the regex above; it describes the title lookup — verify upstream.
228 title = self._html_search_regex('<meta name="title" content="([^"]*)"',
229 webpage, u'player url').split(' : ')[-1]
231 configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
232 configUrl = compat_urllib_parse.unquote(configUrl)
234 configJSON = self._download_webpage(configUrl, videoId,
235 u'Downloading configuration',
236 u'unable to download configuration')
238 # Technically, it's JavaScript, not JSON
239 configJSON = configJSON.replace("'", '"')
242 config = json.loads(configJSON)
243 except (ValueError,) as err:
244 raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
246 playlist = config['playlist']
247 videoUrl = playlist[1]['url']
# (return-dict opening elided; remaining lines are entries of the info dict)
252 'uploader': showName,
257 'description': videoDesc,
258 'player_url': playerUrl,
# Extractor for collegehumor.com: fetches an XML metadata document for the
# video, then the Adobe HDS (f4m) manifest it points at, and assembles the
# final segment URL from the manifest's media node.
# NOTE(review): gaps in the embedded line numbers mean statements (info dict
# initialisation, try:/except IndexError, return) are elided from this listing.
263 class CollegeHumorIE(InfoExtractor):
264     """Information extractor for collegehumor.com"""
267 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
268 IE_NAME = u'collegehumor'
270 def report_manifest(self, video_id):
271     """Report information extraction."""
272 self.to_screen(u'%s: Downloading XML manifest' % video_id)
274 def _real_extract(self, url):
275 mobj = re.match(self._VALID_URL, url)
277 raise ExtractorError(u'Invalid URL: %s' % url)
278 video_id = mobj.group('videoid')
286 self.report_extraction(video_id)
287 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
289 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
290 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
291 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
293 mdoc = xml.etree.ElementTree.fromstring(metaXml)
# findall(...)[0] raises IndexError on a missing element; the (elided)
# except clause maps that to the "Invalid metadata XML file" error below.
295 videoNode = mdoc.findall('./video')[0]
296 info['description'] = videoNode.findall('./description')[0].text
297 info['title'] = videoNode.findall('./caption')[0].text
298 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
299 manifest_url = videoNode.findall('./file')[0].text
301 raise ExtractorError(u'Invalid metadata XML file')
# hdcore is a required query parameter for Adobe HDS manifest requests.
303 manifest_url += '?hdcore=2.10.3'
304 self.report_manifest(video_id)
306 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
307 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
308 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
310 adoc = xml.etree.ElementTree.fromstring(manifestXml)
# The manifest uses the f4m XML namespace; pull the media node and stream id.
312 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
313 node_id = media_node.attrib['url']
314 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
315 except IndexError as err:
316 raise ExtractorError(u'Invalid manifest file')
318 url_pr = compat_urllib_parse_urlparse(manifest_url)
# Build the first-fragment URL from the manifest host plus stream identifiers.
319 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
# Extractor for xvideos.com: pulls the flv URL, page title and thumbnail
# straight out of the video page with regexes.
# NOTE(review): gaps in the embedded line numbers mean statements (if mobj is
# None:, return-dict opening) are elided from this listing.
326 class XVideosIE(InfoExtractor):
327     """Information extractor for xvideos.com"""
329 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
332 def _real_extract(self, url):
333 mobj = re.match(self._VALID_URL, url)
335 raise ExtractorError(u'Invalid URL: %s' % url)
336 video_id = mobj.group(1)
338 webpage = self._download_webpage(url, video_id)
340 self.report_extraction(video_id)
# The player page embeds the media URL percent-encoded in a flv_url parameter.
343 video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
344 webpage, u'video URL'))
347 video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
350 # Extract video thumbnail
351 video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
352 webpage, u'thumbnail', fatal=False)
# (return-dict opening elided; remaining lines are entries of the info dict)
359 'title': video_title,
361 'thumbnail': video_thumbnail,
# Extractor for individual soundcloud.com tracks: resolves the page URL to a
# track id via the public resolve.json API, then fetches the stream
# definitions and picks the 128kbps MP3 HTTP stream.
# NOTE(review): gaps in the embedded line numbers mean statements (if mobj is
# None:, return-dict opening) are elided from this listing.
368 class SoundcloudIE(InfoExtractor):
369     """Information extractor for soundcloud.com
370        To access the media, the uid of the song and a stream token
371        must be extracted from the page source and the script must make
372        a request to media.soundcloud.com/crossdomain.xml. Then
373        the media can be grabbed by requesting from an url composed
374        of the stream token and uid
377 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
378 IE_NAME = u'soundcloud'
380 def report_resolve(self, video_id):
381     """Report information extraction."""
382 self.to_screen(u'%s: Resolving id' % video_id)
384 def _real_extract(self, url):
385 mobj = re.match(self._VALID_URL, url)
387 raise ExtractorError(u'Invalid URL: %s' % url)
389 # extract uploader (which is in the url)
390 uploader = mobj.group(1)
391 # extract simple title (uploader + slug of song title)
392 slug_title = mobj.group(2)
393 simple_title = uploader + u'-' + slug_title
394 full_title = '%s/%s' % (uploader, slug_title)
396 self.report_resolve(full_title)
# The resolve endpoint maps a public page URL to API metadata (JSON).
398 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
399 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
400 info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
402 info = json.loads(info_json)
403 video_id = info['id']
404 self.report_extraction(full_title)
406 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
407 stream_json = self._download_webpage(streams_url, full_title,
408 u'Downloading stream definitions',
409 u'unable to download stream definitions')
411 streams = json.loads(stream_json)
412 mediaURL = streams['http_mp3_128_url']
413 upload_date = unified_strdate(info['created_at'])
# (return-dict opening elided; remaining lines are entries of the info dict)
418 'uploader': info['user']['username'],
419 'upload_date': upload_date,
420 'title': info['title'],
422 'description': info['description'],
# Extractor for soundcloud.com sets (playlists): resolves the set URL via the
# public resolve.json API, then extracts a stream URL for every track in it.
# NOTE(review): gaps in the embedded line numbers mean statements (if mobj is
# None:, error branch, per-track dict opening, return) are elided here.
425 class SoundcloudSetIE(InfoExtractor):
426     """Information extractor for soundcloud.com sets
427        To access the media, the uid of the song and a stream token
428        must be extracted from the page source and the script must make
429        a request to media.soundcloud.com/crossdomain.xml. Then
430        the media can be grabbed by requesting from an url composed
431        of the stream token and uid
434 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
435 IE_NAME = u'soundcloud:set'
437 def report_resolve(self, video_id):
438     """Report information extraction."""
439 self.to_screen(u'%s: Resolving id' % video_id)
441 def _real_extract(self, url):
442 mobj = re.match(self._VALID_URL, url)
444 raise ExtractorError(u'Invalid URL: %s' % url)
446 # extract uploader (which is in the url)
447 uploader = mobj.group(1)
448 # extract simple title (uploader + slug of song title)
449 slug_title = mobj.group(2)
450 simple_title = uploader + u'-' + slug_title
451 full_title = '%s/sets/%s' % (uploader, slug_title)
453 self.report_resolve(full_title)
455 url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
456 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
457 info_json = self._download_webpage(resolv_url, full_title)
460 info = json.loads(info_json)
# API-level errors come back as an 'errors' list in the JSON response.
462 for err in info['errors']:
463 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
466 self.report_extraction(full_title)
# One stream-definition request per track in the set.
467 for track in info['tracks']:
468 video_id = track['id']
470 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
471 stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')
473 self.report_extraction(video_id)
474 streams = json.loads(stream_json)
475 mediaURL = streams['http_mp3_128_url']
# (per-track dict opening elided; remaining lines are entries of that dict)
480 'uploader': track['user']['username'],
481 'upload_date': unified_strdate(track['created_at']),
482 'title': track['title'],
484 'description': track['description'],
# Extractor for infoq.com presentations: decodes a base64 id embedded in the
# page's JavaScript into an RTMPE stream path, and scrapes title/description.
# NOTE(review): gaps in the embedded line numbers mean statements (if mobj is
# None:, return-dict opening) are elided from this listing.
489 class InfoQIE(InfoExtractor):
490     """Information extractor for infoq.com"""
491 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
493 def _real_extract(self, url):
494 mobj = re.match(self._VALID_URL, url)
496 raise ExtractorError(u'Invalid URL: %s' % url)
498 webpage = self._download_webpage(url, video_id=url)
499 self.report_extraction(url)
502 mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
504 raise ExtractorError(u'Unable to extract video url')
# jsclassref holds the base64-encoded, percent-encoded real media id.
505 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
506 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
509 video_title = self._search_regex(r'contentTitle = "(.*?)";',
512 # Extract description
513 video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
514 webpage, u'description', fatal=False)
# The id/extension are derived from the final path component of the stream URL.
516 video_filename = video_url.split('/')[-1]
517 video_id, extension = video_filename.split('.')
# (return-dict opening elided; remaining lines are entries of the info dict)
524 'title': video_title,
525 'ext': extension, # Extension is always(?) mp4, but seems to be flv
527 'description': video_description,
# Extractor for www.mixcloud.com (marked non-working; superseded by a newer
# API). Downloads a JSON document describing available audio formats/bitrates,
# probes candidate URLs for the first live one, and honours --format /
# --list-formats.
# NOTE(review): gaps in the embedded line numbers mean statements (try:,
# return, break, else:) are elided from this listing.
532 class MixcloudIE(InfoExtractor):
533     """Information extractor for www.mixcloud.com"""
535 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
536 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
537 IE_NAME = u'mixcloud'
539 def report_download_json(self, file_id):
540     """Report JSON download."""
541 self.to_screen(u'Downloading json')
543 def get_urls(self, jsonData, fmt, bitrate='best'):
544     """Get urls from 'audio_formats' section in json"""
547 bitrate_list = jsonData[fmt]
548 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
549 bitrate = max(bitrate_list) # select highest
551 url_list = jsonData[fmt][bitrate]
# Some formats carry a flat URL list with no per-bitrate mapping.
552 except TypeError: # we have no bitrate info.
553 url_list = jsonData[fmt]
556 def check_urls(self, url_list):
557     """Returns 1st active url from list"""
# Probe each candidate URL with a real request; first success wins.
560 compat_urllib_request.urlopen(url)
562 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
567 def _print_formats(self, formats):
568 print('Available formats:')
569 for fmt in formats.keys():
570 for b in formats[fmt]:
572 ext = formats[fmt][b][0]
573 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
574 except TypeError: # we have no bitrate info
575 ext = formats[fmt][0]
576 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
579 def _real_extract(self, url):
580 mobj = re.match(self._VALID_URL, url)
582 raise ExtractorError(u'Invalid URL: %s' % url)
583 # extract uploader & filename from url
584 uploader = mobj.group(1).decode('utf-8')
585 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
587 # construct API request
588 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
589 # retrieve .json file with links to files
590 request = compat_urllib_request.Request(file_url)
592 self.report_download_json(file_url)
593 jsonData = compat_urllib_request.urlopen(request).read()
594 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
595 raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
598 json_data = json.loads(jsonData)
599 player_url = json_data['player_swf_url']
600 formats = dict(json_data['audio_formats'])
602 req_format = self._downloader.params.get('format', None)
605 if self._downloader.params.get('listformats', None):
606 self._print_formats(formats)
# Default / 'best' request: scan all formats for the first working URL.
609 if req_format is None or req_format == 'best':
610 for format_param in formats.keys():
611 url_list = self.get_urls(formats, format_param)
613 file_url = self.check_urls(url_list)
614 if file_url is not None:
# Explicitly requested format: validate and probe just that one.
617 if req_format not in formats:
618 raise ExtractorError(u'Format is not available')
620 url_list = self.get_urls(formats, req_format)
621 file_url = self.check_urls(url_list)
622 format_param = req_format
# (return-dict opening elided; remaining lines are entries of the info dict)
625 'id': file_id.decode('utf-8'),
626 'url': file_url.decode('utf-8'),
627 'uploader': uploader.decode('utf-8'),
629 'title': json_data['name'],
630 'ext': file_url.split('.')[-1].decode('utf-8'),
631 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
632 'thumbnail': json_data['thumbnail_url'],
633 'description': json_data['description'],
634 'player_url': player_url.decode('utf-8'),
# Extractor for Stanford Open Classroom. Handles three URL shapes: a specific
# video (course+video), a course page (list of videos), and the root page
# (list of courses); the latter two recurse via self.extract on each link.
# NOTE(review): gaps in the embedded line numbers mean statements (if mobj is
# None:, try:, info dict initialisation, results accumulation/return, loop
# bodies building 'list' entries) are elided from this listing.
637 class StanfordOpenClassroomIE(InfoExtractor):
638     """Information extractor for Stanford's Open ClassRoom"""
640 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
641 IE_NAME = u'stanfordoc'
643 def _real_extract(self, url):
644 mobj = re.match(self._VALID_URL, url)
646 raise ExtractorError(u'Invalid URL: %s' % url)
648 if mobj.group('course') and mobj.group('video'): # A specific video
649 course = mobj.group('course')
650 video = mobj.group('video')
652 'id': course + '_' + video,
657 self.report_extraction(info['id'])
658 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
659 xmlUrl = baseUrl + video + '.xml'
661 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
662 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
663 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
664 mdoc = xml.etree.ElementTree.fromstring(metaXml)
# findall(...)[0] raises IndexError on a missing element; the (elided)
# except clause maps that to the "Invalid metadata XML file" error below.
666 info['title'] = mdoc.findall('./title')[0].text
667 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
669 raise ExtractorError(u'Invalid metadata XML file')
670 info['ext'] = info['url'].rpartition('.')[2]
672 elif mobj.group('course'): # A course page
673 course = mobj.group('course')
681 coursepage = self._download_webpage(url, info['id'],
682 note='Downloading course info page',
683 errnote='Unable to download course info page')
685 info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
687 info['description'] = self._html_search_regex('<description>([^<]+)</description>',
688 coursepage, u'description', fatal=False)
690 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
694 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
# Recurse: each referenced video page goes back through self.extract.
698 for entry in info['list']:
699 assert entry['type'] == 'reference'
700 results += self.extract(entry['url'])
# Root page branch: enumerate all course pages and recurse into each.
704 'id': 'Stanford OpenClassroom',
710 self.report_download_webpage(info['id'])
711 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
713 rootpage = compat_urllib_request.urlopen(rootURL).read()
714 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
715 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
717 info['title'] = info['id']
719 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
723 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
728 for entry in info['list']:
729 assert entry['type'] == 'reference'
730 results += self.extract(entry['url'])
# Extractor for MTV.com video pages: scrapes meta tags for song/artist and the
# mtvn_uri, then downloads a mediaGen XML document and picks the last (highest
# quality) rendition.
# NOTE(review): gaps in the embedded line numbers mean statements (if mobj is
# None:, performer extraction, try:/except for rendition parsing, return dict)
# are elided from this listing.
733 class MTVIE(InfoExtractor):
734     """Information extractor for MTV.com"""
736 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
739 def _real_extract(self, url):
740 mobj = re.match(self._VALID_URL, url)
742 raise ExtractorError(u'Invalid URL: %s' % url)
# Normalise scheme-less URLs before fetching.
743 if not mobj.group('proto'):
744 url = 'http://' + url
745 video_id = mobj.group('videoid')
747 webpage = self._download_webpage(url, video_id)
749 song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
750 webpage, u'song name', fatal=False)
752 video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
755 mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
756 webpage, u'mtvn_uri', fatal=False)
758 content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
759 webpage, u'content id', fatal=False)
# mediaGen returns the XML playlist of renditions for this content id.
761 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
762 self.report_extraction(video_id)
763 request = compat_urllib_request.Request(videogen_url)
765 metadataXml = compat_urllib_request.urlopen(request).read()
766 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
767 raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
769 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
770 renditions = mdoc.findall('.//rendition')
772 # For now, always pick the highest quality.
773 rendition = renditions[-1]
# e.g. type "video/mp4" -> ext "mp4"; format string encodes size and bitrate.
776 _,_,ext = rendition.attrib['type'].partition('/')
777 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
778 video_url = rendition.find('./src').text
780 raise ExtractorError('Invalid rendition field.')
# (return-dict opening elided; remaining lines are entries of the info dict)
785 'uploader': performer,
787 'title': video_title,
# Extractor for v.youku.com: fetches the getPlayList JSON, derives the real
# per-segment file ids from a seeded pseudo-random shuffle of a character
# source string, and emits one info dict per flv segment.
# NOTE(review): gaps in the embedded line numbers mean statements (def
# _gen_sid header, format-selection branches, per-segment dict fields, return)
# are elided from this listing.
795 class YoukuIE(InfoExtractor):
796 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
# (the enclosing "def _gen_sid(self):" line is elided here — TODO confirm)
# Session id: millisecond timestamp concatenated with two random numbers.
799 nowTime = int(time.time() * 1000)
800 random1 = random.randint(1000,1998)
801 random2 = random.randint(1000,9999)
803 return "%d%d%d" %(nowTime,random1,random2)
805 def _get_file_ID_mix_string(self, seed):
# Deterministic LCG-style shuffle of the source alphabet keyed by `seed`.
807 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
809 for i in range(len(source)):
810 seed = (seed * 211 + 30031 ) % 65536
811 index = math.floor(seed / 65536 * len(source) )
812 mixed.append(source[int(index)])
813 source.remove(source[int(index)])
814 #return ''.join(mixed)
817 def _get_file_id(self, fileId, seed):
# Translate the '*'-separated index list into characters of the mixed string.
818 mixed = self._get_file_ID_mix_string(seed)
819 ids = fileId.split('*')
823 realId.append(mixed[int(ch)])
824 return ''.join(realId)
826 def _real_extract(self, url):
827 mobj = re.match(self._VALID_URL, url)
829 raise ExtractorError(u'Invalid URL: %s' % url)
830 video_id = mobj.group('ID')
832 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
834 jsondata = self._download_webpage(info_url, video_id)
836 self.report_extraction(video_id)
838 config = json.loads(jsondata)
840 video_title = config['data'][0]['title']
841 seed = config['data'][0]['seed']
# Format selection against the formats the server actually offers.
843 format = self._downloader.params.get('format', None)
844 supported_format = list(config['data'][0]['streamfileids'].keys())
846 if format is None or format == 'best':
847 if 'hd2' in supported_format:
852 elif format == 'worst':
860 fileid = config['data'][0]['streamfileids'][format]
861 keys = [s['k'] for s in config['data'][0]['segs'][format]]
862 except (UnicodeDecodeError, ValueError, KeyError):
863 raise ExtractorError(u'Unable to extract info section')
866 sid = self._gen_sid()
867 fileid = self._get_file_id(fileid, seed)
869 #column 8,9 of fileid represent the segment number
870 #fileid[7:9] should be changed
871 for index, key in enumerate(keys):
873 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
874 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
# (per-segment dict opening elided; remaining lines are entries of that dict)
877 'id': '%s_part%02d' % (video_id, index),
881 'title': video_title,
884 files_info.append(info)
# Extractor for video.xnxx.com: the flv URL, title and thumbnail are pulled
# from the page with the class-level regex constants below.
# NOTE(review): gaps in the embedded line numbers mean statements (if mobj is
# None:, regex continuation lines, return-dict opening) are elided here.
889 class XNXXIE(InfoExtractor):
890     """Information extractor for xnxx.com"""
892 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
# Page-scraping patterns: media URL, page title, and big-thumbnail URL.
894 VIDEO_URL_RE = r'flv_url=(.*?)&'
895 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
896 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
898 def _real_extract(self, url):
899 mobj = re.match(self._VALID_URL, url)
901 raise ExtractorError(u'Invalid URL: %s' % url)
902 video_id = mobj.group(1)
904 # Get webpage content
905 webpage = self._download_webpage(url, video_id)
907 video_url = self._search_regex(self.VIDEO_URL_RE,
908 webpage, u'video URL')
# The flv_url parameter is percent-encoded in the page source.
909 video_url = compat_urllib_parse.unquote(video_url)
911 video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
914 video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
915 webpage, u'thumbnail', fatal=False)
# (return-dict opening elided; remaining lines are entries of the info dict)
922 'title': video_title,
924 'thumbnail': video_thumbnail,
# Extractor for plus.google.com posts with video: scrapes the post page for
# metadata, follows the photo/video page, and picks the highest-resolution
# redirector.googlevideo.com link.
# NOTE(review): gaps in the embedded line numbers mean statements (if mobj is
# None:, links sorting, try:, return-dict opening) are elided from this view.
929 class GooglePlusIE(InfoExtractor):
930     """Information extractor for plus.google.com."""
932 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
933 IE_NAME = u'plus.google'
935 def _real_extract(self, url):
936 # Extract id from URL
937 mobj = re.match(self._VALID_URL, url)
939 raise ExtractorError(u'Invalid URL: %s' % url)
941 post_url = mobj.group(0)
942 video_id = mobj.group(1)
944 video_extension = 'flv'
946 # Step 1, Retrieve post webpage to extract further information
947 webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
949 self.report_extraction(video_id)
951 # Extract update date
952 upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
953 webpage, u'upload date', fatal=False)
955 # Convert timestring to a format suitable for filename
956 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
957 upload_date = upload_date.strftime('%Y%m%d')
960 uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
961 webpage, u'uploader', fatal=False)
964 # Get the first line for title
965 video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
966 webpage, 'title', default=u'NA')
968 # Step 2, Stimulate clicking the image box to launch video
969 video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
970 webpage, u'video page URL')
971 webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
973 # Extract video links on video page
974 """Extract video links of all sizes"""
975 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
976 mobj = re.findall(pattern, webpage)
978 raise ExtractorError(u'Unable to extract video links')
# (sorting of the (resolution, url) tuples into `links` is elided here)
983 # Choose the lowest of the sort, i.e. highest resolution
984 video_url = links[-1]
985 # Only get the url. The resolution part in the tuple has no use anymore
986 video_url = video_url[-1]
987 # Treat escaped \u0026 style hex
# Python 2 strings have .decode; on Python 3 the AttributeError path is taken.
989 video_url = video_url.decode("unicode_escape")
990 except AttributeError: # Python 3
991 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
# (return-dict opening elided; remaining lines are entries of the info dict)
997 'uploader': uploader,
998 'upload_date': upload_date,
999 'title': video_title,
1000 'ext': video_extension,
# Extractor for nba.com video pages: the media URL is built directly from the
# path captured by _VALID_URL, so only title/description are scraped.
# NOTE(review): gaps in the embedded line numbers mean statements (if mobj is
# None:, return-dict fields such as 'url'/'ext'/'title') are elided here.
1003 class NBAIE(InfoExtractor):
1004 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
1007 def _real_extract(self, url):
1008 mobj = re.match(self._VALID_URL, url)
1010 raise ExtractorError(u'Invalid URL: %s' % url)
1012 video_id = mobj.group(1)
1014 webpage = self._download_webpage(url, video_id)
# The CDN URL is deterministic given the page path; no scraping needed for it.
1016 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
1018 shortened_video_id = video_id.rpartition('/')[2]
1019 title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
1020 webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
1022 # It isn't there in the HTML it returns to us
1023 # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
1025 description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
# (return-dict opening elided; remaining lines are entries of the info dict)
1028 'id': shortened_video_id,
1032 # 'uploader_date': uploader_date,
1033 'description': description,
1037 class JustinTVIE(InfoExtractor):
1038 """Information extractor for justin.tv and twitch.tv"""
1039 # TODO: One broadcast may be split into multiple videos. The key
1040 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
1041 # starts at 1 and increases. Can we treat all parts as one video?
1043 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
1045 (?P<channelid>[^/]+)|
1046 (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
1047 (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
1051 _JUSTIN_PAGE_LIMIT = 100
1052 IE_NAME = u'justin.tv'
1054 def report_download_page(self, channel, offset):
1055 """Report attempt to download a single page of videos."""
1056 self.to_screen(u'%s: Downloading video information from %d to %d' %
1057 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
1059 # Return count of items, list of *valid* items
1060 def _parse_page(self, url, video_id):
1061 webpage = self._download_webpage(url, video_id,
1062 u'Downloading video info JSON',
1063 u'unable to download video info JSON')
1065 response = json.loads(webpage)
1066 if type(response) != list:
1067 error_text = response.get('error', 'unknown error')
1068 raise ExtractorError(u'Justin.tv API: %s' % error_text)
1070 for clip in response:
1071 video_url = clip['video_file_url']
1073 video_extension = os.path.splitext(video_url)[1][1:]
1074 video_date = re.sub('-', '', clip['start_time'][:10])
1075 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
1076 video_id = clip['id']
1077 video_title = clip.get('title', video_id)
1081 'title': video_title,
1082 'uploader': clip.get('channel_name', video_uploader_id),
1083 'uploader_id': video_uploader_id,
1084 'upload_date': video_date,
1085 'ext': video_extension,
1087 return (len(response), info)
# Dispatch on the matched URL kind: channel archive (paged JSON API),
# a single chapter (XML broadcast lookup + Twitch kraken metadata), or a
# single video id; then page through the archive API via _parse_page.
1089 def _real_extract(self, url):
1090 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the `if mobj is None:` guard preceding this raise is not
# visible in this listing — confirm against the full file.
1092 raise ExtractorError(u'invalid URL: %s' % url)
1094 api_base = 'http://api.justin.tv'
1096 if mobj.group('channelid'):
1098 video_id = mobj.group('channelid')
1099 api = api_base + '/channel/archives/%s.json' % video_id
1100 elif mobj.group('chapterid'):
1101 chapter_id = mobj.group('chapterid')
# The chapter page embeds the archive id in a JS assignment; scrape it.
1103 webpage = self._download_webpage(url, chapter_id)
1104 m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
1106 raise ExtractorError(u'Cannot find archive of a chapter')
1107 archive_id = m.group(1)
1109 api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
1110 chapter_info_xml = self._download_webpage(api, chapter_id,
1111 note=u'Downloading chapter information',
1112 errnote=u'Chapter information download failed')
# Find the <archive> element whose <id> matches the scraped archive_id;
# `a` is reused after the loop (for-else style in the original).
1113 doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
1114 for a in doc.findall('.//archive'):
1115 if archive_id == a.find('./id').text:
1118 raise ExtractorError(u'Could not find chapter in chapter information')
1120 video_url = a.find('./video_file_url').text
1121 video_ext = video_url.rpartition('.')[2] or u'flv'
# Chapter title/description/uploader come from the Twitch kraken API.
1123 chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
1124 chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
1125 note='Downloading chapter metadata',
1126 errnote='Download of chapter metadata failed')
1127 chapter_info = json.loads(chapter_info_json)
1129 bracket_start = int(doc.find('.//bracket_start').text)
1130 bracket_end = int(doc.find('.//bracket_end').text)
1132 # TODO determine start (and probably fix up file)
1133 # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
1134 #video_url += u'?start=' + TODO:start_timestamp
1135 # bracket_start is 13290, but we want 51670615
1136 self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
1137 u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
1140 'id': u'c' + chapter_id,
1143 'title': chapter_info['title'],
1144 'thumbnail': chapter_info['preview'],
1145 'description': chapter_info['description'],
1146 'uploader': chapter_info['channel']['display_name'],
1147 'uploader_id': chapter_info['channel']['name'],
1151 video_id = mobj.group('videoid')
1152 api = api_base + '/broadcast/by_archive/%s.json' % video_id
1154 self.report_extraction(video_id)
# Page through the API _JUSTIN_PAGE_LIMIT items at a time; a short page
# (page_count != limit) means we reached the end.
1158 limit = self._JUSTIN_PAGE_LIMIT
1161 self.report_download_page(video_id, offset)
1162 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
1163 page_count, page_info = self._parse_page(page_url, video_id)
1164 info.extend(page_info)
1165 if not paged or page_count != limit:
# Extractor for funnyordie.com video pages: scrapes the second <source>
# element for the media URL and falls back from the player <h1> to <title>
# for the video title.
1170 class FunnyOrDieIE(InfoExtractor):
1171 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
1173 def _real_extract(self, url):
1174 mobj = re.match(self._VALID_URL, url)
1176 raise ExtractorError(u'invalid URL: %s' % url)
1178 video_id = mobj.group('id')
1179 webpage = self._download_webpage(url, video_id)
1181 video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
1182 webpage, u'video URL', flags=re.DOTALL)
# Tuple of patterns: _html_search_regex tries them in order.
1184 title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
1185 r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
1187 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
1188 webpage, u'description', fatal=False, flags=re.DOTALL)
1195 'description': video_description,
# Extractor for Steam store video/app pages. Scrapes the embedded
# 'movie_<id>' JS blobs for file URLs, pairs them positionally (zip) with
# the on-page titles and thumbnails, and returns a playlist result.
1199 class SteamIE(InfoExtractor):
1200 _VALID_URL = r"""http://store\.steampowered\.com/
1202 (?P<urltype>video|app)/ #If the page is only for videos or for a game
1204 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
1206 _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
# Pre-filled birth date of 1970 to pass Steam's age gate automatically.
1207 _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'
# Overridden because _VALID_URL needs the re.VERBOSE flag.
# NOTE(review): the @classmethod decorator (original line 1209) is not
# visible in this listing — `cls` parameter implies it; confirm.
1210 def suitable(cls, url):
1211 """Receives a URL and returns True if suitable for this IE."""
1212 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1214 def _real_extract(self, url):
1215 m = re.match(self._VALID_URL, url, re.VERBOSE)
1216 gameID = m.group('gameID')
1218 videourl = self._VIDEO_PAGE_TEMPLATE % gameID
1219 webpage = self._download_webpage(videourl, gameID)
# If the age gate appears, re-request via the agecheck URL.
1221 if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
1222 videourl = self._AGECHECK_TEMPLATE % gameID
1223 self.report_age_confirmation()
1224 webpage = self._download_webpage(videourl, gameID)
1226 self.report_extraction(gameID)
1227 game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
1228 webpage, 'game title')
1230 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
1231 mweb = re.finditer(urlRE, webpage)
1232 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
1233 titles = re.finditer(namesRE, webpage)
1234 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
1235 thumbs = re.finditer(thumbsRE, webpage)
# Positional pairing assumes the three iterators stay in lockstep order.
1237 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
1238 video_id = vid.group('videoID')
1239 title = vtitle.group('videoName')
1240 video_url = vid.group('videoURL')
1241 video_thumb = thumb.group('thumbnail')
1243 raise ExtractorError(u'Cannot find video url for %s' % video_id)
1248 'title': unescapeHTML(title),
1249 'thumbnail': video_thumb
1252 return [self.playlist_result(videos, gameID, game_title)]
# Extractor for ustream.tv recorded videos: the media URL is derived
# directly from the numeric id (tcdn.ustream.tv); title/uploader/thumbnail
# are scraped from the HTML.
1254 class UstreamIE(InfoExtractor):
1255 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
1256 IE_NAME = u'ustream'
1258 def _real_extract(self, url):
1259 m = re.match(self._VALID_URL, url)
1260 video_id = m.group('videoID')
1262 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
1263 webpage = self._download_webpage(url, video_id)
1265 self.report_extraction(video_id)
1267 video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
1270 uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
1271 webpage, u'uploader', fatal=False, flags=re.DOTALL)
1273 thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
1274 webpage, u'thumbnail', fatal=False)
1280 'title': video_title,
1281 'uploader': uploader,
1282 'thumbnail': thumbnail,
# Extractor for worldstarhiphop.com (and the worldstarcandy variant):
# pulls the flash player's file variable, then title/thumbnail, with a
# candy-specific title override.
1286 class WorldStarHipHopIE(InfoExtractor):
1287 _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
1288 IE_NAME = u'WorldStarHipHop'
1290 def _real_extract(self, url):
1291 m = re.match(self._VALID_URL, url)
1292 video_id = m.group('id')
1294 webpage_src = self._download_webpage(url, video_id)
1296 video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
1297 webpage_src, u'video URL')
# Extension selection branches on whether the URL contains 'mp4';
# the branch bodies are not visible in this listing.
1299 if 'mp4' in video_url:
1304 video_title = self._html_search_regex(r"<title>(.*)</title>",
1305 webpage_src, u'title')
1307 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
1308 thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
1309 webpage_src, u'thumbnail', fatal=False)
# Candy pages carry the real title in a 'candytitles' span; prefer it.
1312 _title = r"""candytitles.*>(.*)</span>"""
1313 mobj = re.search(_title, webpage_src)
1314 if mobj is not None:
1315 video_title = mobj.group(1)
1320 'title' : video_title,
1321 'thumbnail' : thumbnail,
# Extractor for rbmaradio.com shows: show metadata lives in a JS
# `gon.show=...` JSON assignment; the audio URL is the akamai_url with a
# fixed 256kbps cbr parameter appended.
1326 class RBMARadioIE(InfoExtractor):
1327 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
1329 def _real_extract(self, url):
1330 m = re.match(self._VALID_URL, url)
1331 video_id = m.group('videoID')
1333 webpage = self._download_webpage(url, video_id)
1335 json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
1336 webpage, u'json data', flags=re.MULTILINE)
1339 data = json.loads(json_data)
1340 except ValueError as e:
1341 raise ExtractorError(u'Invalid JSON: ' + str(e))
1343 video_url = data['akamai_url'] + '&cbr=256'
# Audio extension taken from the URL path suffix.
1344 url_parts = compat_urllib_parse_urlparse(video_url)
1345 video_ext = url_parts.path.rpartition('.')[2]
1350 'title': data['title'],
1351 'description': data.get('teaser_text'),
1352 'location': data.get('country_of_origin'),
1353 'uploader': data.get('host', {}).get('name'),
1354 'uploader_id': data.get('host', {}).get('slug'),
1355 'thumbnail': data.get('image', {}).get('large_url_2x'),
1356 'duration': data.get('duration'),
# Extractor for youporn.com: bypasses the age gate with a cookie, reads a
# JSON 'currentVideo' blob for metadata, scrapes the downloadList <ul> for
# per-quality links, and applies the user's requested format selection.
1361 class YouPornIE(InfoExtractor):
1362 """Information extractor for youporn.com."""
1363 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
# Print a table of available formats to stdout (used by --list-formats).
1365 def _print_formats(self, formats):
1366 """Print all available formats"""
1367 print(u'Available formats:')
1368 print(u'ext\t\tformat')
1369 print(u'---------------------------------')
1370 for format in formats:
1371 print(u'%s\t\t%s' % (format['ext'], format['format']))
# Select the single entry whose 'format' equals req_format; the loop
# header and return lines are not visible in this listing.
1373 def _specific(self, req_format, formats):
1375 if(x["format"]==req_format):
1379 def _real_extract(self, url):
1380 mobj = re.match(self._VALID_URL, url)
1382 raise ExtractorError(u'Invalid URL: %s' % url)
1383 video_id = mobj.group('videoid')
# The age_verified cookie skips the age confirmation page.
1385 req = compat_urllib_request.Request(url)
1386 req.add_header('Cookie', 'age_verified=1')
1387 webpage = self._download_webpage(req, video_id)
1389 # Get JSON parameters
1390 json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
1392 params = json.loads(json_params)
1394 raise ExtractorError(u'Invalid JSON')
1396 self.report_extraction(video_id)
1398 video_title = params['title']
1399 upload_date = unified_strdate(params['release_date_f'])
1400 video_description = params['description']
1401 video_uploader = params['submitted_by']
1402 thumbnail = params['thumbnails'][0]['image']
# NOTE(review): 'str' + sys.exc_info()[1] concatenates a str with an
# exception object and would itself raise TypeError — should be
# compat_str(sys.exc_info()[1]); fix when the full file is in view.
1404 raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])
1406 # Get all of the formats available
1407 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
1408 download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
1409 webpage, u'download list').strip()
1411 # Get all of the links from the page
1412 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
1413 links = re.findall(LINK_RE, download_list_html)
1414 if(len(links) == 0):
1415 raise ExtractorError(u'ERROR: no known formats available for video')
1417 self.to_screen(u'Links found: %d' % len(links))
1422 # A link looks like this:
1423 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
1424 # A path looks like this:
1425 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
1426 video_url = unescapeHTML( link )
1427 path = compat_urllib_parse_urlparse( video_url ).path
# Path component 4 encodes resolution_bitrate, e.g. '480p_370k'.
1428 extension = os.path.splitext( path )[1][1:]
1429 format = path.split('/')[4].split('_')[:2]
1432 format = "-".join( format )
1433 # title = u'%s-%s-%s' % (video_title, size, bitrate)
1438 'uploader': video_uploader,
1439 'upload_date': upload_date,
1440 'title': video_title,
1443 'thumbnail': thumbnail,
1444 'description': video_description
# Format selection: honor --list-formats, then best/worst/all/specific.
1447 if self._downloader.params.get('listformats', None):
1448 self._print_formats(formats)
1451 req_format = self._downloader.params.get('format', None)
1452 self.to_screen(u'Format: %s' % req_format)
1454 if req_format is None or req_format == 'best':
1456 elif req_format == 'worst':
1457 return [formats[-1]]
1458 elif req_format in ('-1', 'all'):
1461 format = self._specific( req_format, formats )
1463 raise ExtractorError(u'Requested format not available')
# Extractor for pornotube.com: flv URL comes from a JS 'url:' assignment
# (percent-encoded), upload date from the "Added ... by" byline.
1468 class PornotubeIE(InfoExtractor):
1469 """Information extractor for pornotube.com."""
1470 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
1472 def _real_extract(self, url):
1473 mobj = re.match(self._VALID_URL, url)
1475 raise ExtractorError(u'Invalid URL: %s' % url)
# Title is taken from the URL itself, not the page.
1477 video_id = mobj.group('videoid')
1478 video_title = mobj.group('title')
1480 # Get webpage content
1481 webpage = self._download_webpage(url, video_id)
1484 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
1485 video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
1486 video_url = compat_urllib_parse.unquote(video_url)
1488 #Get the uploaded date
1489 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
1490 upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
1491 if upload_date: upload_date = unified_strdate(upload_date)
1493 info = {'id': video_id,
1496 'upload_date': upload_date,
1497 'title': video_title,
# Extractor for youjizz.com: resolves the embed page first, then reads the
# flash player's encodeURIComponent'd file variable from it.
1503 class YouJizzIE(InfoExtractor):
1504 """Information extractor for youjizz.com."""
1505 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
1507 def _real_extract(self, url):
1508 mobj = re.match(self._VALID_URL, url)
1510 raise ExtractorError(u'Invalid URL: %s' % url)
1512 video_id = mobj.group('videoid')
1514 # Get webpage content
1515 webpage = self._download_webpage(url, video_id)
1517 # Get the video title
1518 video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
1519 webpage, u'title').strip()
1521 # Get the embed page
1522 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
1524 raise ExtractorError(u'ERROR: unable to extract embed page')
# video_id is replaced by the numeric id from the embed URL.
1526 embed_page_url = result.group(0).strip()
1527 video_id = result.group('videoid')
1529 webpage = self._download_webpage(embed_page_url, video_id)
1532 video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
1533 webpage, u'video URL')
1535 info = {'id': video_id,
1537 'title': video_title,
1540 'player_url': embed_page_url}
# Extractor for 8tracks.com mixes: reads the PAGE.mix JSON from the page,
# then walks the play/next API with a random session id, collecting one
# track per iteration until at_last_track.
1544 class EightTracksIE(InfoExtractor):
1546 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
1548 def _real_extract(self, url):
1549 mobj = re.match(self._VALID_URL, url)
1551 raise ExtractorError(u'Invalid URL: %s' % url)
1552 playlist_id = mobj.group('id')
1554 webpage = self._download_webpage(url, playlist_id)
1556 json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
1557 data = json.loads(json_like)
# Random session id is required by the 8tracks play API.
1559 session = str(random.randint(0, 1000000000))
# NOTE(review): `mix_id` is used below but its assignment (presumably
# mix_id = data['id']) is not visible in this listing — confirm.
1561 track_count = data['tracks_count']
1562 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
1563 next_url = first_url
1565 for i in itertools.count():
1566 api_json = self._download_webpage(next_url, playlist_id,
1567 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
1568 errnote=u'Failed to download song information')
1569 api_data = json.loads(api_json)
1570 track_data = api_data[u'set']['track']
1572 'id': track_data['id'],
1573 'url': track_data['track_file_stream_url'],
1574 'title': track_data['performer'] + u' - ' + track_data['name'],
1575 'raw_title': track_data['name'],
1576 'uploader_id': data['user']['login'],
# Stop once the API reports the final track of the set.
1580 if api_data['set']['at_last_track']:
1582 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# Extractor for keek.com: media and thumbnail URLs are built directly
# from the video id against cdn.keek.com; title/uploader scraped from HTML.
1585 class KeekIE(InfoExtractor):
1586 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
1589 def _real_extract(self, url):
1590 m = re.match(self._VALID_URL, url)
1591 video_id = m.group('videoID')
1593 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
1594 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
1595 webpage = self._download_webpage(url, video_id)
1597 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
1600 uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
1601 webpage, u'uploader', fatal=False)
1607 'title': video_title,
1608 'thumbnail': thumbnail,
1609 'uploader': uploader
# Extractor for ted.com talks and playlists. Playlist pages are expanded
# into url_result entries (re-dispatched to this IE); single talks read
# the embedded talkDetails JSON and take the last htmlStreams entry.
1613 class TEDIE(InfoExtractor):
1614 _VALID_URL=r'''http://www\.ted\.com/
1616 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
1618 ((?P<type_talk>talks)) # We have a simple talk
1620 (/lang/(.*?))? # The url may contain the language
1621 /(?P<name>\w+) # Here goes the name and then ".html"
# Overridden because _VALID_URL needs the re.VERBOSE flag.
# NOTE(review): the @classmethod decorator is not visible in this
# listing — `cls` parameter implies it; confirm.
1625 def suitable(cls, url):
1626 """Receives a URL and returns True if suitable for this IE."""
1627 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1629 def _real_extract(self, url):
1630 m=re.match(self._VALID_URL, url, re.VERBOSE)
1631 if m.group('type_talk'):
1632 return [self._talk_info(url)]
1634 playlist_id=m.group('playlist_id')
1635 name=m.group('name')
1636 self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
1637 return [self._playlist_videos_info(url,name,playlist_id)]
1639 def _playlist_videos_info(self,url,name,playlist_id=0):
1640 '''Returns the videos of the playlist'''
# video_RE body (verbose, multi-line); the r''' opener is not visible here.
1642 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
1643 ([.\s]*?)data-playlist_item_id="(\d+)"
1644 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
1646 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
1647 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
1648 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
1649 m_names=re.finditer(video_name_RE,webpage)
1651 playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
1652 webpage, 'playlist title')
# Each playlist entry is handed back to the TED extractor via url_result.
1654 playlist_entries = []
1655 for m_video, m_name in zip(m_videos,m_names):
1656 video_id=m_video.group('video_id')
1657 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
1658 playlist_entries.append(self.url_result(talk_url, 'TED'))
1659 return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
1661 def _talk_info(self, url, video_id=0):
1662 """Return the video for the talk in the url"""
1663 m = re.match(self._VALID_URL, url,re.VERBOSE)
1664 video_name = m.group('name')
1665 webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
1666 self.report_extraction(video_name)
1667 # If the url includes the language we get the title translated
1668 title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
1670 json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
1671 webpage, 'json data')
1672 info = json.loads(json_data)
1673 desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
1674 webpage, 'description', flags = re.DOTALL)
1676 thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
1677 webpage, 'thumbnail')
# Last htmlStreams entry is used — presumably the highest quality; confirm.
1680 'url': info['htmlStreams'][-1]['file'],
1683 'thumbnail': thumbnail,
1684 'description': desc,
# Extractor for myspass.de: derives the video id from the URL path (with a
# trailing-slash fallback), then reads all metadata from the site's XML
# metadata endpoint, guarding each element lookup individually.
1689 class MySpassIE(InfoExtractor):
1690 _VALID_URL = r'http://www.myspass.de/.*'
1691 def _real_extract(self, url):
1692 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
1694 # video id is the last path element of the URL
1695 # usually there is a trailing slash, so also try the second but last
1696 url_path = compat_urllib_parse_urlparse(url).path
1697 url_parent_path, video_id = os.path.split(url_path)
1699 _, video_id = os.path.split(url_parent_path)
1702 metadata_url = META_DATA_URL_TEMPLATE % video_id
1703 metadata_text = self._download_webpage(metadata_url, video_id)
1704 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
1706 # extract values from metadata
1707 url_flv_el = metadata.find('url_flv')
1708 if url_flv_el is None:
1709 raise ExtractorError(u'Unable to extract download url')
1710 video_url = url_flv_el.text
1711 extension = os.path.splitext(video_url)[1][1:]
1712 title_el = metadata.find('title')
1713 if title_el is None:
1714 raise ExtractorError(u'Unable to extract title')
1715 title = title_el.text
1716 format_id_el = metadata.find('format_id')
1717 if format_id_el is None:
1720 format = format_id_el.text
# description and imagePreview are optional in the XML.
1721 description_el = metadata.find('description')
1722 if description_el is not None:
1723 description = description_el.text
1726 imagePreview_el = metadata.find('imagePreview')
1727 if imagePreview_el is not None:
1728 thumbnail = imagePreview_el.text
1737 'thumbnail': thumbnail,
1738 'description': description
# Extractor for spiegel.de videos: title from the page, then the flash XML
# manifest; the last <type> entry in the manifest supplies filename and
# duration (presumably the best variant — confirm).
1742 class SpiegelIE(InfoExtractor):
1743 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
1745 def _real_extract(self, url):
1746 m = re.match(self._VALID_URL, url)
1747 video_id = m.group('videoID')
1749 webpage = self._download_webpage(url, video_id)
1751 video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
1754 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
1755 xml_code = self._download_webpage(xml_url, video_id,
1756 note=u'Downloading XML', errnote=u'Failed to download XML')
1758 idoc = xml.etree.ElementTree.fromstring(xml_code)
1759 last_type = idoc[-1]
1760 filename = last_type.findall('./filename')[0].text
1761 duration = float(last_type.findall('./duration')[0].text)
1763 video_url = 'http://video2.spiegel.de/flash/' + filename
1764 video_ext = filename.rpartition('.')[2]
1769 'title': video_title,
1770 'duration': duration,
# Extractor for liveleak.com: media URL from a JS 'file:' assignment,
# metadata from OpenGraph tags; the site prefix is stripped from the title.
1774 class LiveLeakIE(InfoExtractor):
1776 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
1777 IE_NAME = u'liveleak'
1779 def _real_extract(self, url):
1780 mobj = re.match(self._VALID_URL, url)
1782 raise ExtractorError(u'Invalid URL: %s' % url)
1784 video_id = mobj.group('video_id')
1786 webpage = self._download_webpage(url, video_id)
1788 video_url = self._search_regex(r'file: "(.*?)",',
1789 webpage, u'video URL')
1791 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
1792 webpage, u'title').replace('LiveLeak.com -', '').strip()
1794 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
1795 webpage, u'description', fatal=False)
1797 video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
1798 webpage, u'uploader', fatal=False)
1804 'title': video_title,
1805 'description': video_description,
1806 'uploader': video_uploader
# Extractor for Tumblr video posts: normalizes the URL to the canonical
# post form, then scrapes the \x22-escaped embed markup for the video_file
# URL, extension, and first poster thumbnail.
1813 class TumblrIE(InfoExtractor):
1814 _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
1816 def _real_extract(self, url):
1817 m_url = re.match(self._VALID_URL, url)
1818 video_id = m_url.group('id')
1819 blog = m_url.group('blog_name')
1821 url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
1822 webpage = self._download_webpage(url, video_id)
# The embed markup is JS-escaped, hence the literal \x22 (= '"') tokens.
1824 re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
1825 video = re.search(re_video, webpage)
1827 raise ExtractorError(u'Unable to extract video')
1828 video_url = video.group('video_url')
1829 ext = video.group('ext')
1831 video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
1832 webpage, u'thumbnail', fatal=False) # We pick the first poster
1833 if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
1835 # The only place where you can get a title, it's not complete,
1836 # but searching in other places doesn't work for all videos
1837 video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
1838 webpage, u'title', flags=re.DOTALL)
1840 return [{'id': video_id,
1842 'title': video_title,
1843 'thumbnail': video_thumbnail,
# Extractor for free Bandcamp tracks: finds the free-download page, pulls
# the track's mp3-320 entry from the embedded 'items' JSON, then rebuilds
# a working statdownload URL (the initial signed URL expires).
1847 class BandcampIE(InfoExtractor):
1848 _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
1850 def _real_extract(self, url):
1851 mobj = re.match(self._VALID_URL, url)
1852 title = mobj.group('title')
1853 webpage = self._download_webpage(url, title)
1854 # We get the link to the free download page
1855 m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
1856 if m_download is None:
1857 raise ExtractorError(u'No free songs found')
1859 download_link = m_download.group(1)
# NOTE(review): `id` shadows the builtin; rename when the full file is
# editable.
1860 id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
1861 webpage, re.MULTILINE|re.DOTALL).group('id')
1863 download_webpage = self._download_webpage(download_link, id,
1864 'Downloading free downloads page')
1865 # We get the dictionary of the track from some javascrip code
1866 info = re.search(r'items: (.*?),$',
1867 download_webpage, re.MULTILINE).group(1)
1868 info = json.loads(info)[0]
1869 # We pick mp3-320 for now, until format selection can be easily implemented.
1870 mp3_info = info[u'downloads'][u'mp3-320']
1871 # If we try to use this url it says the link has expired
1872 initial_url = mp3_info[u'url']
1873 re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
1874 m_url = re.match(re_url, initial_url)
1875 #We build the url we will use to get the final track url
1876 # This url is build in Bandcamp in the script download_bunde_*.js
1877 request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
1878 final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
1879 # If we could correctly generate the .rand field the url would be
1880 #in the "download_url" key
1881 final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
1883 track_info = {'id':id,
1884 'title' : info[u'title'],
1887 'thumbnail' : info[u'thumb_url'],
1888 'uploader' : info[u'artist']
# Extractor for redtube.com: mp4 URL from the HTML5 <source> tag, title
# from the page heading.
1894 class RedTubeIE(InfoExtractor):
1895 """Information Extractor for redtube"""
1896 _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
1897 def _real_extract(self,url):
1898 mobj = re.match(self._VALID_URL, url)
1900 raise ExtractorError(u'Invalid URL: %s' % url)
1902 video_id = mobj.group('id')
1903 video_extension = 'mp4'
1904 webpage = self._download_webpage(url, video_id)
1906 self.report_extraction(video_id)
1908 video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
1909 webpage, u'video URL')
1911 video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
1917 'ext': video_extension,
1918 'title': video_title,
# Extractor for ina.fr: fetches the player's MRSS notice for the video and
# reads the mp4 URL from <media:player> and the title from a CDATA block.
1921 class InaIE(InfoExtractor):
1922 """Information Extractor for Ina.fr"""
1923 _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
1925 def _real_extract(self,url):
1926 mobj = re.match(self._VALID_URL, url)
1928 video_id = mobj.group('id')
1929 mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
1930 video_extension = 'mp4'
1931 webpage = self._download_webpage(mrss_url, video_id)
1933 self.report_extraction(video_id)
1935 video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
1936 webpage, u'video URL')
1938 video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
1944 'ext': video_extension,
1945 'title': video_title,
# Extractor for howcast.com: canonicalizes the URL by id, reads the mobile
# mp4 URL from a JS 'file' assignment and metadata from meta tags.
1948 class HowcastIE(InfoExtractor):
1949 """Information Extractor for Howcast.com"""
1950 _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
1952 def _real_extract(self, url):
1953 mobj = re.match(self._VALID_URL, url)
1955 video_id = mobj.group('id')
1956 webpage_url = 'http://www.howcast.com/videos/' + video_id
1957 webpage = self._download_webpage(webpage_url, video_id)
1959 self.report_extraction(video_id)
1961 video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
1962 webpage, u'video URL')
# Meta tags may use either quote style around content, hence the alternation.
1964 video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
1967 video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
1968 webpage, u'description', fatal=False)
1970 thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
1971 webpage, u'thumbnail', fatal=False)
1977 'title': video_title,
1978 'description': video_description,
1979 'thumbnail': thumbnail,
# Extractor for vine.co: canonical URL by id, media from the
# twitter:player:stream meta tag, metadata from OpenGraph tags.
1982 class VineIE(InfoExtractor):
1983 """Information Extractor for Vine.co"""
1984 _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
1986 def _real_extract(self, url):
1987 mobj = re.match(self._VALID_URL, url)
1989 video_id = mobj.group('id')
1990 webpage_url = 'https://vine.co/v/' + video_id
1991 webpage = self._download_webpage(webpage_url, video_id)
1993 self.report_extraction(video_id)
1995 video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
1996 webpage, u'video URL')
1998 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
# Query string on the og:image URL is dropped by the second group.
2001 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
2002 webpage, u'thumbnail', fatal=False)
2004 uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
2005 webpage, u'uploader', fatal=False, flags=re.DOTALL)
2011 'title': video_title,
2012 'thumbnail': thumbnail,
2013 'uploader': uploader,
# Extractor for Flickr videos: a two-step XML handshake — first fetch the
# photo 'secret' from the page, then the node_id from video_mtl_xml.gne,
# then the playlist XML whose <STREAM> element yields the final URL.
2016 class FlickrIE(InfoExtractor):
2017 """Information Extractor for Flickr videos"""
2018 _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
2020 def _real_extract(self, url):
2021 mobj = re.match(self._VALID_URL, url)
2023 video_id = mobj.group('id')
2024 video_uploader_id = mobj.group('uploader_id')
2025 webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
2026 webpage = self._download_webpage(webpage_url, video_id)
2028 secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
2030 first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
2031 first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
2033 node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
2034 first_xml, u'node_id')
2036 second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
2037 second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
2039 self.report_extraction(video_id)
# Final URL = APP attribute + HTML-unescaped FULLPATH attribute.
2041 mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
2043 raise ExtractorError(u'Unable to extract video url')
2044 video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
2046 video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
2047 webpage, u'video title')
2049 video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
2050 webpage, u'description', fatal=False)
2052 thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
2053 webpage, u'thumbnail', fatal=False)
2059 'title': video_title,
2060 'description': video_description,
2061 'thumbnail': thumbnail,
2062 'uploader_id': video_uploader_id,
# Extractor for teamcoco.com: the numeric id is scraped from the article
# markup (the URL only carries a slug), then the CVP XML supplies the
# high-quality file URL.
2065 class TeamcocoIE(InfoExtractor):
2066 _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
2068 def _real_extract(self, url):
2069 mobj = re.match(self._VALID_URL, url)
2071 raise ExtractorError(u'Invalid URL: %s' % url)
2072 url_title = mobj.group('url_title')
2073 webpage = self._download_webpage(url, url_title)
2075 video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
2076 webpage, u'video id')
2078 self.report_extraction(video_id)
2080 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
2083 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
2084 webpage, u'thumbnail', fatal=False)
2086 video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
2087 webpage, u'description', fatal=False)
2089 data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
2090 data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
2092 video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
2099 'title': video_title,
2100 'thumbnail': thumbnail,
2101 'description': video_description,
# Extractor for xhamster.com: media URL assembled from the player's
# 'srv'/'file' JS variables (file may be a full percent-encoded URL or a
# key relative to srv); upload date parsed from a tooltip hint attribute.
2104 class XHamsterIE(InfoExtractor):
2105 """Information Extractor for xHamster"""
2106 _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
2108 def _real_extract(self,url):
2109 mobj = re.match(self._VALID_URL, url)
2111 video_id = mobj.group('id')
2112 mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
2113 webpage = self._download_webpage(mrss_url, video_id)
2115 mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
2117 raise ExtractorError(u'Unable to extract media URL')
# Empty server means 'file' already holds the (quoted) absolute URL.
2118 if len(mobj.group('server')) == 0:
2119 video_url = compat_urllib_parse.unquote(mobj.group('file'))
2121 video_url = mobj.group('server')+'/key='+mobj.group('file')
2122 video_extension = video_url.split('.')[-1]
2124 video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
2127 # Can't see the description anywhere in the UI
2128 # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
2129 # webpage, u'description', fatal=False)
2130 # if video_description: video_description = unescapeHTML(video_description)
# Upload date lives in a hover-hint timestamp; join Y+m+d into YYYYMMDD.
2132 mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
2134 video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
2136 video_upload_date = None
2137 self._downloader.report_warning(u'Unable to extract upload date')
2139 video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
2140 webpage, u'uploader id', default=u'anonymous')
2142 video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
2143 webpage, u'thumbnail', fatal=False)
2148 'ext': video_extension,
2149 'title': video_title,
2150 # 'description': video_description,
2151 'upload_date': video_upload_date,
2152 'uploader_id': video_uploader_id,
2153 'thumbnail': video_thumbnail
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = match.group(1)

        # Request the track page with a timestamped query so we also receive
        # the session cookie required later by the serve endpoint.
        query = compat_urllib_parse.urlencode({ 'ax': 1, 'ts': time.time() })
        request = compat_urllib_request.Request(url + "?" + query)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        # The page embeds its playlist as JSON inside a <script> block.
        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track = json.loads(html_tracks)[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        # Ask the serve endpoint (authenticated via the page cookie) for the
        # final media URL.
        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id':       track_id,
            'url':      final_url,
            'ext':      "mp3",
            'title':    title,
            'artist':   artist,
        }]
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = match.group(1)

        # The play page only contains a JavaScript redirect; follow it by hand.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        webpage = self._download_webpage(urlh.geturl() + new_location, video_id, u'Downloading redirect page')

        # The <title> has the form "video title/uploader"; keep only the title.
        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        # Ask the player backend for the final media and thumbnail URLs.
        info_request = compat_urllib_request.Request(
            "http://vbox7.com/play/magare.do",
            compat_urllib_parse.urlencode({'as3':'1','vid':video_id}))
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        # The response looks like "url=...&thumb=..."; keep only the values.
        final_url, thumbnail_url = [field.split('=')[1] for field in info_response.split('&')]

        return [{
            'id':        video_id,
            'url':       final_url,
            'ext':       "flv",
            'title':     title,
            'thumbnail': thumbnail_url,
        }]
# NOTE(review): only a fragment of this function is visible in this view —
# most of the extractor-instance list (and the return statement) is not shown.
# It builds the ordered list of extractor instances; ordering determines which
# extractor handles a given URL, so do not reorder entries casually.
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
        YoutubePlaylistIE(),
        StanfordOpenClassroomIE(),
        WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Extractor classes follow the "<Name>IE" naming convention, so the class
    # can be looked up directly in this module's namespace.
    return globals()['%sIE' % ie_name]