youtube_dl/InfoExtractors.py

   1 import base64
   2 import datetime
   3 import itertools
   4 import netrc
   5 import os
   6 import re
   7 import socket
   8 import time
   9 import email.utils
  10 import xml.etree.ElementTree
  11 import random
  12 import math
  13 import operator
  14 import hashlib
  15 import binascii
  16 import urllib
  17
  18 from .utils import *
  19 from .extractor.common import InfoExtractor, SearchInfoExtractor
  20
  21 from .extractor.ard import ARDIE
  22 from .extractor.arte import ArteTvIE
  23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
  24 from .extractor.comedycentral import ComedyCentralIE
  25 from .extractor.dailymotion import DailymotionIE
  26 from .extractor.depositfiles import DepositFilesIE
  27 from .extractor.facebook import FacebookIE
  28 from .extractor.gametrailers import GametrailersIE
  29 from .extractor.generic import GenericIE
  30 from .extractor.googleplus import GooglePlusIE
  31 from .extractor.googlesearch import GoogleSearchIE
  32 from .extractor.metacafe import MetacafeIE
  33 from .extractor.myvideo import MyVideoIE
  34 from .extractor.statigram import StatigramIE
  35 from .extractor.photobucket import PhotobucketIE
  36 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
  37 from .extractor.vimeo import VimeoIE
  38 from .extractor.yahoo import YahooIE, YahooSearchIE
  39 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
  40 from .extractor.zdf import ZDFIE
  41
  42
  43
  44
  45
  46
  47
  48
  49
  50
  51
  52
  53
  54
  55
  56
  57
  58
  59
  60
  61
  62
  63
  64
  65
  66
  67
  68
  69 class EscapistIE(InfoExtractor):
  70     """Information extractor for The Escapist """
  71
  72     _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
  73     IE_NAME = u'escapist'
  74
  75     def _real_extract(self, url):
  76         mobj = re.match(self._VALID_URL, url)
  77         if mobj is None:
  78             raise ExtractorError(u'Invalid URL: %s' % url)
  79         showName = mobj.group('showname')
  80         videoId = mobj.group('episode')
  81
  82         self.report_extraction(videoId)
  83         webpage = self._download_webpage(url, videoId)
  84
  85         videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
  86             webpage, u'description', fatal=False)
  87
  88         imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
  89             webpage, u'thumbnail', fatal=False)
  90
  91         playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
  92             webpage, u'player url')
  93
  94         title = self._html_search_regex('<meta name="title" content="([^"]*)"',
  95             webpage, u'player url').split(' : ')[-1]
  96
  97         configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
  98         configUrl = compat_urllib_parse.unquote(configUrl)
  99
 100         configJSON = self._download_webpage(configUrl, videoId,
 101                                             u'Downloading configuration',
 102                                             u'unable to download configuration')
 103
 104         # Technically, it's JavaScript, not JSON
 105         configJSON = configJSON.replace("'", '"')
 106
 107         try:
 108             config = json.loads(configJSON)
 109         except (ValueError,) as err:
 110             raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
 111
 112         playlist = config['playlist']
 113         videoUrl = playlist[1]['url']
 114
 115         info = {
 116             'id': videoId,
 117             'url': videoUrl,
 118             'uploader': showName,
 119             'upload_date': None,
 120             'title': title,
 121             'ext': 'mp4',
 122             'thumbnail': imgUrl,
 123             'description': videoDesc,
 124             'player_url': playerUrl,
 125         }
 126
 127         return [info]
 128
 129 class CollegeHumorIE(InfoExtractor):
 130     """Information extractor for collegehumor.com"""
 131
 132     _WORKING = False
 133     _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
 134     IE_NAME = u'collegehumor'
 135
 136     def report_manifest(self, video_id):
 137         """Report information extraction."""
 138         self.to_screen(u'%s: Downloading XML manifest' % video_id)
 139
 140     def _real_extract(self, url):
 141         mobj = re.match(self._VALID_URL, url)
 142         if mobj is None:
 143             raise ExtractorError(u'Invalid URL: %s' % url)
 144         video_id = mobj.group('videoid')
 145
 146         info = {
 147             'id': video_id,
 148             'uploader': None,
 149             'upload_date': None,
 150         }
 151
 152         self.report_extraction(video_id)
 153         xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
 154         try:
 155             metaXml = compat_urllib_request.urlopen(xmlUrl).read()
 156         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 157             raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
 158
 159         mdoc = xml.etree.ElementTree.fromstring(metaXml)
 160         try:
 161             videoNode = mdoc.findall('./video')[0]
 162             info['description'] = videoNode.findall('./description')[0].text
 163             info['title'] = videoNode.findall('./caption')[0].text
 164             info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
 165             manifest_url = videoNode.findall('./file')[0].text
 166         except IndexError:
 167             raise ExtractorError(u'Invalid metadata XML file')
 168
 169         manifest_url += '?hdcore=2.10.3'
 170         self.report_manifest(video_id)
 171         try:
 172             manifestXml = compat_urllib_request.urlopen(manifest_url).read()
 173         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 174             raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
 175
 176         adoc = xml.etree.ElementTree.fromstring(manifestXml)
 177         try:
 178             media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
 179             node_id = media_node.attrib['url']
 180             video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
 181         except IndexError as err:
 182             raise ExtractorError(u'Invalid manifest file')
 183
 184         url_pr = compat_urllib_parse_urlparse(manifest_url)
 185         url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
 186
 187         info['url'] = url
 188         info['ext'] = 'f4f'
 189         return [info]
 190
 191
 192 class XVideosIE(InfoExtractor):
 193     """Information extractor for xvideos.com"""
 194
 195     _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
 196     IE_NAME = u'xvideos'
 197
 198     def _real_extract(self, url):
 199         mobj = re.match(self._VALID_URL, url)
 200         if mobj is None:
 201             raise ExtractorError(u'Invalid URL: %s' % url)
 202         video_id = mobj.group(1)
 203
 204         webpage = self._download_webpage(url, video_id)
 205
 206         self.report_extraction(video_id)
 207
 208         # Extract video URL
 209         video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
 210             webpage, u'video URL'))
 211
 212         # Extract title
 213         video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
 214             webpage, u'title')
 215
 216         # Extract video thumbnail
 217         video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
 218             webpage, u'thumbnail', fatal=False)
 219
 220         info = {
 221             'id': video_id,
 222             'url': video_url,
 223             'uploader': None,
 224             'upload_date': None,
 225             'title': video_title,
 226             'ext': 'flv',
 227             'thumbnail': video_thumbnail,
 228             'description': None,
 229         }
 230
 231         return [info]
 232
 233
 234
 235
 236 class InfoQIE(InfoExtractor):
 237     """Information extractor for infoq.com"""
 238     _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
 239
 240     def _real_extract(self, url):
 241         mobj = re.match(self._VALID_URL, url)
 242         if mobj is None:
 243             raise ExtractorError(u'Invalid URL: %s' % url)
 244
 245         webpage = self._download_webpage(url, video_id=url)
 246         self.report_extraction(url)
 247
 248         # Extract video URL
 249         mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
 250         if mobj is None:
 251             raise ExtractorError(u'Unable to extract video url')
 252         real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
 253         video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
 254
 255         # Extract title
 256         video_title = self._search_regex(r'contentTitle = "(.*?)";',
 257             webpage, u'title')
 258
 259         # Extract description
 260         video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
 261             webpage, u'description', fatal=False)
 262
 263         video_filename = video_url.split('/')[-1]
 264         video_id, extension = video_filename.split('.')
 265
 266         info = {
 267             'id': video_id,
 268             'url': video_url,
 269             'uploader': None,
 270             'upload_date': None,
 271             'title': video_title,
 272             'ext': extension, # Extension is always(?) mp4, but seems to be flv
 273             'thumbnail': None,
 274             'description': video_description,
 275         }
 276
 277         return [info]
 278
 279 class MixcloudIE(InfoExtractor):
 280     """Information extractor for www.mixcloud.com"""
 281
 282     _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
 283     _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
 284     IE_NAME = u'mixcloud'
 285
 286     def report_download_json(self, file_id):
 287         """Report JSON download."""
 288         self.to_screen(u'Downloading json')
 289
 290     def get_urls(self, jsonData, fmt, bitrate='best'):
 291         """Get urls from 'audio_formats' section in json"""
 292         file_url = None
 293         try:
 294             bitrate_list = jsonData[fmt]
 295             if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
 296                 bitrate = max(bitrate_list) # select highest
 297
 298             url_list = jsonData[fmt][bitrate]
 299         except TypeError: # we have no bitrate info.
 300             url_list = jsonData[fmt]
 301         return url_list
 302
 303     def check_urls(self, url_list):
 304         """Returns 1st active url from list"""
 305         for url in url_list:
 306             try:
 307                 compat_urllib_request.urlopen(url)
 308                 return url
 309             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 310                 url = None
 311
 312         return None
 313
 314     def _print_formats(self, formats):
 315         print('Available formats:')
 316         for fmt in formats.keys():
 317             for b in formats[fmt]:
 318                 try:
 319                     ext = formats[fmt][b][0]
 320                     print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
 321                 except TypeError: # we have no bitrate info
 322                     ext = formats[fmt][0]
 323                     print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
 324                     break
 325
 326     def _real_extract(self, url):
 327         mobj = re.match(self._VALID_URL, url)
 328         if mobj is None:
 329             raise ExtractorError(u'Invalid URL: %s' % url)
 330         # extract uploader & filename from url
 331         uploader = mobj.group(1).decode('utf-8')
 332         file_id = uploader + "-" + mobj.group(2).decode('utf-8')
 333
 334         # construct API request
 335         file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
 336         # retrieve .json file with links to files
 337         request = compat_urllib_request.Request(file_url)
 338         try:
 339             self.report_download_json(file_url)
 340             jsonData = compat_urllib_request.urlopen(request).read()
 341         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 342             raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
 343
 344         # parse JSON
 345         json_data = json.loads(jsonData)
 346         player_url = json_data['player_swf_url']
 347         formats = dict(json_data['audio_formats'])
 348
 349         req_format = self._downloader.params.get('format', None)
 350         bitrate = None
 351
 352         if self._downloader.params.get('listformats', None):
 353             self._print_formats(formats)
 354             return
 355
 356         if req_format is None or req_format == 'best':
 357             for format_param in formats.keys():
 358                 url_list = self.get_urls(formats, format_param)
 359                 # check urls
 360                 file_url = self.check_urls(url_list)
 361                 if file_url is not None:
 362                     break # got it!
 363         else:
 364             if req_format not in formats:
 365                 raise ExtractorError(u'Format is not available')
 366
 367             url_list = self.get_urls(formats, req_format)
 368             file_url = self.check_urls(url_list)
 369             format_param = req_format
 370
 371         return [{
 372             'id': file_id.decode('utf-8'),
 373             'url': file_url.decode('utf-8'),
 374             'uploader': uploader.decode('utf-8'),
 375             'upload_date': None,
 376             'title': json_data['name'],
 377             'ext': file_url.split('.')[-1].decode('utf-8'),
 378             'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
 379             'thumbnail': json_data['thumbnail_url'],
 380             'description': json_data['description'],
 381             'player_url': player_url.decode('utf-8'),
 382         }]
 383
 384 class StanfordOpenClassroomIE(InfoExtractor):
 385     """Information extractor for Stanford's Open ClassRoom"""
 386
 387     _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
 388     IE_NAME = u'stanfordoc'
 389
 390     def _real_extract(self, url):
 391         mobj = re.match(self._VALID_URL, url)
 392         if mobj is None:
 393             raise ExtractorError(u'Invalid URL: %s' % url)
 394
 395         if mobj.group('course') and mobj.group('video'): # A specific video
 396             course = mobj.group('course')
 397             video = mobj.group('video')
 398             info = {
 399                 'id': course + '_' + video,
 400                 'uploader': None,
 401                 'upload_date': None,
 402             }
 403
 404             self.report_extraction(info['id'])
 405             baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
 406             xmlUrl = baseUrl + video + '.xml'
 407             try:
 408                 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
 409             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 410                 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
 411             mdoc = xml.etree.ElementTree.fromstring(metaXml)
 412             try:
 413                 info['title'] = mdoc.findall('./title')[0].text
 414                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
 415             except IndexError:
 416                 raise ExtractorError(u'Invalid metadata XML file')
 417             info['ext'] = info['url'].rpartition('.')[2]
 418             return [info]
 419         elif mobj.group('course'): # A course page
 420             course = mobj.group('course')
 421             info = {
 422                 'id': course,
 423                 'type': 'playlist',
 424                 'uploader': None,
 425                 'upload_date': None,
 426             }
 427
 428             coursepage = self._download_webpage(url, info['id'],
 429                                         note='Downloading course info page',
 430                                         errnote='Unable to download course info page')
 431
 432             info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
 433
 434             info['description'] = self._html_search_regex('<description>([^<]+)</description>',
 435                 coursepage, u'description', fatal=False)
 436
 437             links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
 438             info['list'] = [
 439                 {
 440                     'type': 'reference',
 441                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
 442                 }
 443                     for vpage in links]
 444             results = []
 445             for entry in info['list']:
 446                 assert entry['type'] == 'reference'
 447                 results += self.extract(entry['url'])
 448             return results
 449         else: # Root page
 450             info = {
 451                 'id': 'Stanford OpenClassroom',
 452                 'type': 'playlist',
 453                 'uploader': None,
 454                 'upload_date': None,
 455             }
 456
 457             self.report_download_webpage(info['id'])
 458             rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
 459             try:
 460                 rootpage = compat_urllib_request.urlopen(rootURL).read()
 461             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 462                 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
 463
 464             info['title'] = info['id']
 465
 466             links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
 467             info['list'] = [
 468                 {
 469                     'type': 'reference',
 470                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
 471                 }
 472                     for cpage in links]
 473
 474             results = []
 475             for entry in info['list']:
 476                 assert entry['type'] == 'reference'
 477                 results += self.extract(entry['url'])
 478             return results
 479
 480 class MTVIE(InfoExtractor):
 481     """Information extractor for MTV.com"""
 482
 483     _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
 484     IE_NAME = u'mtv'
 485
 486     def _real_extract(self, url):
 487         mobj = re.match(self._VALID_URL, url)
 488         if mobj is None:
 489             raise ExtractorError(u'Invalid URL: %s' % url)
 490         if not mobj.group('proto'):
 491             url = 'http://' + url
 492         video_id = mobj.group('videoid')
 493
 494         webpage = self._download_webpage(url, video_id)
 495
 496         song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
 497             webpage, u'song name', fatal=False)
 498
 499         video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
 500             webpage, u'title')
 501
 502         mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
 503             webpage, u'mtvn_uri', fatal=False)
 504
 505         content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
 506             webpage, u'content id', fatal=False)
 507
 508         videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
 509         self.report_extraction(video_id)
 510         request = compat_urllib_request.Request(videogen_url)
 511         try:
 512             metadataXml = compat_urllib_request.urlopen(request).read()
 513         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 514             raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
 515
 516         mdoc = xml.etree.ElementTree.fromstring(metadataXml)
 517         renditions = mdoc.findall('.//rendition')
 518
 519         # For now, always pick the highest quality.
 520         rendition = renditions[-1]
 521
 522         try:
 523             _,_,ext = rendition.attrib['type'].partition('/')
 524             format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
 525             video_url = rendition.find('./src').text
 526         except KeyError:
 527             raise ExtractorError('Invalid rendition field.')
 528
 529         info = {
 530             'id': video_id,
 531             'url': video_url,
 532             'uploader': performer,
 533             'upload_date': None,
 534             'title': video_title,
 535             'ext': ext,
 536             'format': format,
 537         }
 538
 539         return [info]
 540
 541
 542 class YoukuIE(InfoExtractor):
 543     _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
 544
 545     def _gen_sid(self):
 546         nowTime = int(time.time() * 1000)
 547         random1 = random.randint(1000,1998)
 548         random2 = random.randint(1000,9999)
 549
 550         return "%d%d%d" %(nowTime,random1,random2)
 551
 552     def _get_file_ID_mix_string(self, seed):
 553         mixed = []
 554         source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
 555         seed = float(seed)
 556         for i in range(len(source)):
 557             seed  =  (seed * 211 + 30031 ) % 65536
 558             index  =  math.floor(seed / 65536 * len(source) )
 559             mixed.append(source[int(index)])
 560             source.remove(source[int(index)])
 561         #return ''.join(mixed)
 562         return mixed
 563
 564     def _get_file_id(self, fileId, seed):
 565         mixed = self._get_file_ID_mix_string(seed)
 566         ids = fileId.split('*')
 567         realId = []
 568         for ch in ids:
 569             if ch:
 570                 realId.append(mixed[int(ch)])
 571         return ''.join(realId)
 572
 573     def _real_extract(self, url):
 574         mobj = re.match(self._VALID_URL, url)
 575         if mobj is None:
 576             raise ExtractorError(u'Invalid URL: %s' % url)
 577         video_id = mobj.group('ID')
 578
 579         info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
 580
 581         jsondata = self._download_webpage(info_url, video_id)
 582
 583         self.report_extraction(video_id)
 584         try:
 585             config = json.loads(jsondata)
 586
 587             video_title =  config['data'][0]['title']
 588             seed = config['data'][0]['seed']
 589
 590             format = self._downloader.params.get('format', None)
 591             supported_format = list(config['data'][0]['streamfileids'].keys())
 592
 593             if format is None or format == 'best':
 594                 if 'hd2' in supported_format:
 595                     format = 'hd2'
 596                 else:
 597                     format = 'flv'
 598                 ext = u'flv'
 599             elif format == 'worst':
 600                 format = 'mp4'
 601                 ext = u'mp4'
 602             else:
 603                 format = 'flv'
 604                 ext = u'flv'
 605
 606
 607             fileid = config['data'][0]['streamfileids'][format]
 608             keys = [s['k'] for s in config['data'][0]['segs'][format]]
 609         except (UnicodeDecodeError, ValueError, KeyError):
 610             raise ExtractorError(u'Unable to extract info section')
 611
 612         files_info=[]
 613         sid = self._gen_sid()
 614         fileid = self._get_file_id(fileid, seed)
 615
 616         #column 8,9 of fileid represent the segment number
 617         #fileid[7:9] should be changed
 618         for index, key in enumerate(keys):
 619
 620             temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
 621             download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
 622
 623             info = {
 624                 'id': '%s_part%02d' % (video_id, index),
 625                 'url': download_url,
 626                 'uploader': None,
 627                 'upload_date': None,
 628                 'title': video_title,
 629                 'ext': ext,
 630             }
 631             files_info.append(info)
 632
 633         return files_info
 634
 635
 636 class XNXXIE(InfoExtractor):
 637     """Information extractor for xnxx.com"""
 638
 639     _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
 640     IE_NAME = u'xnxx'
 641     VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
 642     VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
 643     VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
 644
 645     def _real_extract(self, url):
 646         mobj = re.match(self._VALID_URL, url)
 647         if mobj is None:
 648             raise ExtractorError(u'Invalid URL: %s' % url)
 649         video_id = mobj.group(1)
 650
 651         # Get webpage content
 652         webpage = self._download_webpage(url, video_id)
 653
 654         video_url = self._search_regex(self.VIDEO_URL_RE,
 655             webpage, u'video URL')
 656         video_url = compat_urllib_parse.unquote(video_url)
 657
 658         video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
 659             webpage, u'title')
 660
 661         video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
 662             webpage, u'thumbnail', fatal=False)
 663
 664         return [{
 665             'id': video_id,
 666             'url': video_url,
 667             'uploader': None,
 668             'upload_date': None,
 669             'title': video_title,
 670             'ext': 'flv',
 671             'thumbnail': video_thumbnail,
 672             'description': None,
 673         }]
 674
 675
 676
 677 class NBAIE(InfoExtractor):
 678     _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
 679     IE_NAME = u'nba'
 680
 681     def _real_extract(self, url):
 682         mobj = re.match(self._VALID_URL, url)
 683         if mobj is None:
 684             raise ExtractorError(u'Invalid URL: %s' % url)
 685
 686         video_id = mobj.group(1)
 687
 688         webpage = self._download_webpage(url, video_id)
 689
 690         video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
 691
 692         shortened_video_id = video_id.rpartition('/')[2]
 693         title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
 694             webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
 695
 696         # It isn't there in the HTML it returns to us
 697         # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
 698
 699         description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
 700
 701         info = {
 702             'id': shortened_video_id,
 703             'url': video_url,
 704             'ext': 'mp4',
 705             'title': title,
 706             # 'uploader_date': uploader_date,
 707             'description': description,
 708         }
 709         return [info]
 710
 711 class JustinTVIE(InfoExtractor):
 712     """Information extractor for justin.tv and twitch.tv"""
 713     # TODO: One broadcast may be split into multiple videos. The key
 714     # 'broadcast_id' is the same for all parts, and 'broadcast_part'
 715     # starts at 1 and increases. Can we treat all parts as one video?
 716
 717     _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
 718         (?:
 719             (?P<channelid>[^/]+)|
 720             (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
 721             (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
 722         )
 723         /?(?:\#.*)?$
 724         """
 725     _JUSTIN_PAGE_LIMIT = 100
 726     IE_NAME = u'justin.tv'
 727
 728     def report_download_page(self, channel, offset):
 729         """Report attempt to download a single page of videos."""
 730         self.to_screen(u'%s: Downloading video information from %d to %d' %
 731                 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
 732
 733     # Return count of items, list of *valid* items
 734     def _parse_page(self, url, video_id):
 735         webpage = self._download_webpage(url, video_id,
 736                                          u'Downloading video info JSON',
 737                                          u'unable to download video info JSON')
 738
 739         response = json.loads(webpage)
 740         if type(response) != list:
 741             error_text = response.get('error', 'unknown error')
 742             raise ExtractorError(u'Justin.tv API: %s' % error_text)
 743         info = []
 744         for clip in response:
 745             video_url = clip['video_file_url']
 746             if video_url:
 747                 video_extension = os.path.splitext(video_url)[1][1:]
 748                 video_date = re.sub('-', '', clip['start_time'][:10])
 749                 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
 750                 video_id = clip['id']
 751                 video_title = clip.get('title', video_id)
 752                 info.append({
 753                     'id': video_id,
 754                     'url': video_url,
 755                     'title': video_title,
 756                     'uploader': clip.get('channel_name', video_uploader_id),
 757                     'uploader_id': video_uploader_id,
 758                     'upload_date': video_date,
 759                     'ext': video_extension,
 760                 })
 761         return (len(response), info)
 762
 763     def _real_extract(self, url):
 764         mobj = re.match(self._VALID_URL, url)
 765         if mobj is None:
 766             raise ExtractorError(u'invalid URL: %s' % url)
 767
 768         api_base = 'http://api.justin.tv'
 769         paged = False
 770         if mobj.group('channelid'):
 771             paged = True
 772             video_id = mobj.group('channelid')
 773             api = api_base + '/channel/archives/%s.json' % video_id
 774         elif mobj.group('chapterid'):
 775             chapter_id = mobj.group('chapterid')
 776
 777             webpage = self._download_webpage(url, chapter_id)
 778             m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
 779             if not m:
 780                 raise ExtractorError(u'Cannot find archive of a chapter')
 781             archive_id = m.group(1)
 782
 783             api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
 784             chapter_info_xml = self._download_webpage(api, chapter_id,
 785                                              note=u'Downloading chapter information',
 786                                              errnote=u'Chapter information download failed')
 787             doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
 788             for a in doc.findall('.//archive'):
 789                 if archive_id == a.find('./id').text:
 790                     break
 791             else:
 792                 raise ExtractorError(u'Could not find chapter in chapter information')
 793
 794             video_url = a.find('./video_file_url').text
 795             video_ext = video_url.rpartition('.')[2] or u'flv'
 796
 797             chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
 798             chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
 799                                    note='Downloading chapter metadata',
 800                                    errnote='Download of chapter metadata failed')
 801             chapter_info = json.loads(chapter_info_json)
 802
 803             bracket_start = int(doc.find('.//bracket_start').text)
 804             bracket_end = int(doc.find('.//bracket_end').text)
 805
 806             # TODO determine start (and probably fix up file)
 807             #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
 808             #video_url += u'?start=' + TODO:start_timestamp
 809             # bracket_start is 13290, but we want 51670615
 810             self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
 811                                             u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
 812
 813             info = {
 814                 'id': u'c' + chapter_id,
 815                 'url': video_url,
 816                 'ext': video_ext,
 817                 'title': chapter_info['title'],
 818                 'thumbnail': chapter_info['preview'],
 819                 'description': chapter_info['description'],
 820                 'uploader': chapter_info['channel']['display_name'],
 821                 'uploader_id': chapter_info['channel']['name'],
 822             }
 823             return [info]
 824         else:
 825             video_id = mobj.group('videoid')
 826             api = api_base + '/broadcast/by_archive/%s.json' % video_id
 827
 828         self.report_extraction(video_id)
 829
 830         info = []
 831         offset = 0
 832         limit = self._JUSTIN_PAGE_LIMIT
 833         while True:
 834             if paged:
 835                 self.report_download_page(video_id, offset)
 836             page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
 837             page_count, page_info = self._parse_page(page_url, video_id)
 838             info.extend(page_info)
 839             if not paged or page_count != limit:
 840                 break
 841             offset += limit
 842         return info
 843
 844 class FunnyOrDieIE(InfoExtractor):
 845     _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
 846
 847     def _real_extract(self, url):
 848         mobj = re.match(self._VALID_URL, url)
 849         if mobj is None:
 850             raise ExtractorError(u'invalid URL: %s' % url)
 851
 852         video_id = mobj.group('id')
 853         webpage = self._download_webpage(url, video_id)
 854
 855         video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
 856             webpage, u'video URL', flags=re.DOTALL)
 857
 858         title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
 859             r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
 860
 861         video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
 862             webpage, u'description', fatal=False, flags=re.DOTALL)
 863
 864         info = {
 865             'id': video_id,
 866             'url': video_url,
 867             'ext': 'mp4',
 868             'title': title,
 869             'description': video_description,
 870         }
 871         return [info]
 872
 873 class SteamIE(InfoExtractor):
 874     _VALID_URL = r"""http://store\.steampowered\.com/
 875                 (agecheck/)?
 876                 (?P<urltype>video|app)/ #If the page is only for videos or for a game
 877                 (?P<gameID>\d+)/?
 878                 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
 879                 """
 880     _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
 881     _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'
 882
 883     @classmethod
 884     def suitable(cls, url):
 885         """Receives a URL and returns True if suitable for this IE."""
 886         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
 887
 888     def _real_extract(self, url):
 889         m = re.match(self._VALID_URL, url, re.VERBOSE)
 890         gameID = m.group('gameID')
 891
 892         videourl = self._VIDEO_PAGE_TEMPLATE % gameID
 893         webpage = self._download_webpage(videourl, gameID)
 894
 895         if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
 896             videourl = self._AGECHECK_TEMPLATE % gameID
 897             self.report_age_confirmation()
 898             webpage = self._download_webpage(videourl, gameID)
 899
 900         self.report_extraction(gameID)
 901         game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
 902                                              webpage, 'game title')
 903
 904         urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
 905         mweb = re.finditer(urlRE, webpage)
 906         namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
 907         titles = re.finditer(namesRE, webpage)
 908         thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
 909         thumbs = re.finditer(thumbsRE, webpage)
 910         videos = []
 911         for vid,vtitle,thumb in zip(mweb,titles,thumbs):
 912             video_id = vid.group('videoID')
 913             title = vtitle.group('videoName')
 914             video_url = vid.group('videoURL')
 915             video_thumb = thumb.group('thumbnail')
 916             if not video_url:
 917                 raise ExtractorError(u'Cannot find video url for %s' % video_id)
 918             info = {
 919                 'id':video_id,
 920                 'url':video_url,
 921                 'ext': 'flv',
 922                 'title': unescapeHTML(title),
 923                 'thumbnail': video_thumb
 924                   }
 925             videos.append(info)
 926         return [self.playlist_result(videos, gameID, game_title)]
 927
 928 class UstreamIE(InfoExtractor):
 929     _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
 930     IE_NAME = u'ustream'
 931
 932     def _real_extract(self, url):
 933         m = re.match(self._VALID_URL, url)
 934         video_id = m.group('videoID')
 935
 936         video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
 937         webpage = self._download_webpage(url, video_id)
 938
 939         self.report_extraction(video_id)
 940
 941         video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
 942             webpage, u'title')
 943
 944         uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
 945             webpage, u'uploader', fatal=False, flags=re.DOTALL)
 946
 947         thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
 948             webpage, u'thumbnail', fatal=False)
 949
 950         info = {
 951                 'id': video_id,
 952                 'url': video_url,
 953                 'ext': 'flv',
 954                 'title': video_title,
 955                 'uploader': uploader,
 956                 'thumbnail': thumbnail,
 957                }
 958         return info
 959
 960 class WorldStarHipHopIE(InfoExtractor):
 961     _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
 962     IE_NAME = u'WorldStarHipHop'
 963
 964     def _real_extract(self, url):
 965         m = re.match(self._VALID_URL, url)
 966         video_id = m.group('id')
 967
 968         webpage_src = self._download_webpage(url, video_id)
 969
 970         video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
 971             webpage_src, u'video URL')
 972
 973         if 'mp4' in video_url:
 974             ext = 'mp4'
 975         else:
 976             ext = 'flv'
 977
 978         video_title = self._html_search_regex(r"<title>(.*)</title>",
 979             webpage_src, u'title')
 980
 981         # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
 982         thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
 983             webpage_src, u'thumbnail', fatal=False)
 984
 985         if not thumbnail:
 986             _title = r"""candytitles.*>(.*)</span>"""
 987             mobj = re.search(_title, webpage_src)
 988             if mobj is not None:
 989                 video_title = mobj.group(1)
 990
 991         results = [{
 992                     'id': video_id,
 993                     'url' : video_url,
 994                     'title' : video_title,
 995                     'thumbnail' : thumbnail,
 996                     'ext' : ext,
 997                     }]
 998         return results
 999
class RBMARadioIE(InfoExtractor):
    """Information extractor for shows on rbmaradio.com."""

    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        """Return a one-element list describing the show at `url`.

        The show's metadata is read from the `gon.show=` JSON assignment
        embedded in the page.  Raises ExtractorError if that JSON cannot be
        parsed.
        """
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        json_data = self._search_regex(
            r'window\.gon.*?gon\.show=(.+?);$',
            webpage, u'json data', flags=re.MULTILINE)

        try:
            data = json.loads(json_data)
        except ValueError as err:
            raise ExtractorError(u'Invalid JSON: ' + str(err))

        video_url = data['akamai_url'] + '&cbr=256'
        # The extension comes from the URL path so the query is ignored.
        media_path = compat_urllib_parse_urlparse(video_url).path
        video_ext = media_path.rpartition('.')[2]

        host = data.get('host', {})
        image = data.get('image', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': image.get('large_url_2x'),
            'duration': data.get('duration'),
        }]
1033
1034
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for fmt in formats:
            print(u'%s\t\t%s' % (fmt['ext'], fmt['format']))

    def _specific(self, req_format, formats):
        """Return the entry of `formats` whose 'format' equals `req_format`.

        Returns None when no entry matches.
        """
        for candidate in formats:
            if candidate['format'] == req_format:
                return candidate
        return None

    def _real_extract(self, url):
        """Extract the downloadable formats of a video page.

        One info dictionary is built for every link of the page's download
        list.  Depending on the downloader's 'format' parameter the result is
        the first entry (None or 'best'), the last entry ('worst'), every
        entry ('-1' or 'all') or the entry with the requested format name.
        With the 'listformats' parameter set, the formats are printed and
        None is returned.

        Raises ExtractorError for a non-matching URL, unparsable or
        incomplete video JSON, an empty download list, or a requested format
        that is not available.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        try:
            params = json.loads(json_params)
        except ValueError:
            # json.loads reports malformed input with ValueError; anything
            # else (e.g. KeyboardInterrupt) must not be swallowed.
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
        try:
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
        except (KeyError, IndexError) as err:
            # Format the exception instead of concatenating it to a string,
            # which would raise TypeError and hide the real problem.
            raise ExtractorError(u'Missing JSON parameter: %s' % err)

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:
            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            # The format name is "<size>-<bitrate>", taken from the fifth
            # path component (e.g. "480p_370k_8004515" -> "480p-370k").
            format_name = "-".join(path.split('/')[4].split('_')[:2])

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'ext': extension,
                'format': format_name,
                'thumbnail': thumbnail,
                'description': video_description
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            selected = self._specific(req_format, formats)
            if selected is None:
                raise ExtractorError(u'Requested format not available')
            return [selected]
1139
1140
1141
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        """Return a one-element list describing the video at `url`.

        The id and the title come from the URL itself; the media URL and the
        (optional) upload date are scraped from the page.
        Raises ExtractorError if the URL does not match _VALID_URL.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id, video_title = match.group('videoid'), match.group('title')

        webpage = self._download_webpage(url, video_id)

        # The media URL is percent-encoded inside the page.
        quoted_url = self._search_regex(
            r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",',
            webpage, u'video url')
        video_url = compat_urllib_parse.unquote(quoted_url)

        # The upload date is optional; normalise it only when present.
        upload_date = self._html_search_regex(
            r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by',
            webpage, u'upload date', fatal=False)
        if upload_date:
            upload_date = unified_strdate(upload_date)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': upload_date,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
        }]
1176
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        """Return a one-element list describing the video at `url`.

        The title is read from the video page; the media URL is read from the
        embed page that the video page links to.  The returned id is the
        numeric id found in the embed page URL.
        Raises ExtractorError if the URL does not match _VALID_URL or no
        embed page link is found.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        page_id = match.group('videoid')
        webpage = self._download_webpage(url, page_id)

        title = self._html_search_regex(
            r'<title>(?P<title>.*)</title>', webpage, u'title')
        video_title = title.strip()

        # Locate the embed page, which carries the actual media URL.
        embed_match = re.search(
            r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        embed_page = self._download_webpage(embed_page_url, video_id)

        video_url = self._search_regex(
            r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            embed_page, u'video URL')

        return [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
            'player_url': embed_page_url,
        }]
1217
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""

    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        """Return one info dictionary per track of the mix at `url`.

        The mix description is read from the `PAGE.mix` assignment in the
        page.  Tracks are then requested one at a time: the first through the
        'play' endpoint, every following one through the 'next' endpoint,
        until the API flags the last track.
        Raises ExtractorError if the URL does not match _VALID_URL.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = match.group('id')

        webpage = self._download_webpage(url, playlist_id)

        mix_json = self._search_regex(
            r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(mix_json)

        # A random number serves as the play session token in the API URLs.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']

        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        tracks = []
        position = 0
        while True:
            position += 1
            api_json = self._download_webpage(
                next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(position), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track = api_data[u'set']['track']
            tracks.append({
                'id': track['id'],
                'url': track['track_file_stream_url'],
                'title': track['performer'] + u' - ' + track['name'],
                'raw_title': track['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                return tracks
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track['id'])
1258
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""

    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        """Return a one-element list describing the keek at `url`.

        The media and thumbnail URLs are built from the video id; the page is
        downloaded for the title (mandatory) and the uploader (optional).
        """
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        title = self._html_search_regex(
            r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title')

        uploader = self._html_search_regex(
            r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)

        return [{
            'id': video_id,
            'url': u'http://cdn.keek.com/keek/video/%s' % video_id,
            'ext': 'mp4',
            'title': title,
            'thumbnail': u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id,
            'uploader': uploader,
        }]
1286
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""

    # Verbose pattern: must be matched with re.VERBOSE.
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return bool(re.match(cls._VALID_URL, url, re.VERBOSE))

    def _real_extract(self, url):
        """Dispatch to talk or playlist extraction; returns a one-element list."""
        match = re.match(self._VALID_URL, url, re.VERBOSE)
        if match.group('type_talk'):
            return [self._talk_info(url)]

        playlist_id = match.group('playlist_id')
        name = match.group('name')
        self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
        return [self._playlist_videos_info(url, name, playlist_id)]

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        item_re = r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        name_re = r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage = self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        item_matches = re.finditer(item_re, webpage, re.VERBOSE)
        name_matches = re.finditer(name_re, webpage)

        playlist_title = self._html_search_regex(
            r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
            webpage, 'playlist title')

        # Items and names are paired positionally; only the name match is
        # needed to build the talk URL, the item match bounds the count.
        entries = [
            self.url_result('http://www.ted.com%s' % name_match.group('talk_url'), 'TED')
            for _item_match, name_match in zip(item_matches, name_matches)
        ]
        return self.playlist_result(entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        video_name = re.match(self._VALID_URL, url, re.VERBOSE).group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
        self.report_extraction(video_name)
        # If the url includes the language we get the title translated
        title = self._html_search_regex(
            r'<span id="altHeadline" >(?P<title>.*)</span>', webpage, 'title')
        json_data = self._search_regex(
            r'<script.*?>var talkDetails = ({.*?})</script>', webpage, 'json data')
        talk_details = json.loads(json_data)
        description = self._html_search_regex(
            r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
            webpage, 'description', flags = re.DOTALL)

        thumbnail = self._search_regex(
            r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"', webpage, 'thumbnail')
        return {
            'id': talk_details['id'],
            # The last entry of 'htmlStreams' is the one that is downloaded.
            'url': talk_details['htmlStreams'][-1]['file'],
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'description': description,
        }
1361
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de."""

    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        """Return a one-element list describing the video at `url`.

        The video id is the last non-empty path element of the URL.  All
        other fields come from the site's metadata XML for that id:
        'url_flv' and 'title' are mandatory, while 'format_id' (falling back
        to the file extension), 'description' and 'imagePreview' are
        optional.

        Raises ExtractorError if the download URL or the title is missing.
        """
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        def optional_text(tag):
            """Return the text of child element `tag`, or None if absent."""
            element = metadata.find(tag)
            return None if element is None else element.text

        # extract values from metadata
        video_url = optional_text('url_flv')
        if not video_url:
            # Covers both a missing element and an element without text,
            # which would otherwise crash in os.path.splitext.
            raise ExtractorError(u'Unable to extract download url')
        extension = os.path.splitext(video_url)[1][1:]

        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text

        # Without a usable format id the file extension names the format.
        video_format = optional_text('format_id') or extension

        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': video_format,
            'thumbnail': optional_text('imagePreview'),
            'description': optional_text('description'),
        }
        return [info]
1415
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""

    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        """Return a one-element list describing the video at `url`.

        The title comes from the web page; the file name and the duration
        come from the last child element of the video's XML description.
        """
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        title = self._html_search_regex(
            r'<div class="module-title">(.*?)</div>', webpage, u'title')

        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(
            xml_url, video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')

        # Only the last variant listed in the document is used.
        variant = xml.etree.ElementTree.fromstring(xml_code)[-1]
        filename = variant.findall('./filename')[0].text
        duration = float(variant.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': title,
            'duration': duration,
        }]
1447
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com."""

    # The scheme is optional and may be http or https.  (The pattern used to
    # read 'http?://', which makes the final "p" optional and therefore
    # accepted 'htt://' while rejecting 'https://'.)
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        """Return a one-element list describing the video at `url`.

        The media URL and the title are mandatory; the description and the
        uploader are optional.  The 'LiveLeak.com -' prefix is removed from
        the title.
        Raises ExtractorError if the URL does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

        info = {
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader
        }

        return [info]
1484
1485
1486
class TumblrIE(InfoExtractor):
    """Information extractor for videos in tumblr.com posts."""

    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        """Return a one-element list describing the video of the post.

        The post page (rebuilt from the blog name and the post id) is
        downloaded; the media URL and its extension are read from the
        \\x22-escaped markup embedded in it.
        Raises ExtractorError if no video is found in the page.
        """
        url_match = re.match(self._VALID_URL, url)
        video_id = url_match.group('id')
        blog = url_match.group('blog_name')

        # Always fetch the canonical /post/ page, whatever the input URL was.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video_match = re.search(re_video, webpage)
        if video_match is None:
            raise ExtractorError(u'Unable to extract video')

        # We pick the first poster
        thumbnail = self._search_regex(
            r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)
        if thumbnail:
            thumbnail = thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        title = self._html_search_regex(
            r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_match.group('video_url'),
            'title': title,
            'thumbnail': thumbnail,
            'ext': video_match.group('ext'),
        }]
1520
class BandcampIE(InfoExtractor):
    """Information extractor for free-download tracks on Bandcamp."""

    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        """Extract the mp3-320 download of a Bandcamp track.

        url -- track page URL matching _VALID_URL.

        Returns a one-element list holding the info dictionary with the keys
        'id', 'title', 'ext', 'url', 'thumbnail' and 'uploader'.
        Raises ExtractorError if the URL is invalid, if the track has no free
        download page, or if one of the intermediate pages does not contain
        the expected data.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        m_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                         webpage, re.MULTILINE|re.DOTALL)
        if m_id is None:
            raise ExtractorError(u'Unable to extract track id')
        track_id = m_id.group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some JavaScript code
        m_items = re.search(r'items: (.*?),$',
                            download_webpage, re.MULTILINE)
        if m_items is None:
            raise ExtractorError(u'Unable to extract track information')
        info = json.loads(m_items.group(1))[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        if m_url is None:
            raise ExtractorError(u'Unable to parse download url')
        # We build the url we will use to get the final track url
        # This url is built in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        m_final = re.search(r'"retry_url":"(.*?)"', final_url_webpage)
        if m_final is None:
            raise ExtractorError(u'Unable to extract final download url')
        final_url = m_final.group(1)

        track_info = {'id': track_id,
                      'title' : info[u'title'],
                      'ext' :   'mp3',
                      'url' :   final_url,
                      'thumbnail' : info[u'thumb_url'],
                      'uploader' :  info[u'artist']
                      }

        return [track_info]
1566
class RedTubeIE(InfoExtractor):
    """Extractor for videos hosted on redtube.com."""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        """Return a one-item list with the info dictionary for `url`.

        Raises ExtractorError when `url` does not match _VALID_URL.
        """
        match = re.match(self._VALID_URL, url)
        if not match:
            raise ExtractorError(u'Invalid URL: %s' % url)

        ident = match.group('id')
        page = self._download_webpage(url, ident)
        self.report_extraction(ident)

        # Direct media link from the <source> tag of the player markup.
        media_url = self._html_search_regex(
            r'<source src="(.+?)" type="video/mp4">', page, u'video URL')
        # Title as shown in the page heading.
        heading = self._html_search_regex(
            '<h1 class="videoTitle slidePanelMovable">(.+?)</h1>', page, u'title')

        entry = {
            'id': ident,
            'url': media_url,
            'ext': 'mp4',
            'title': heading,
        }
        return [entry]
1594
class InaIE(InfoExtractor):
    """Extractor for videos hosted on Ina.fr."""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        """Return a one-item list with the info dictionary for `url`.

        The data is read from the player's .mrss notice for the video id
        found in `url`, not from the page at `url` itself.
        """
        ident = re.match(self._VALID_URL, url).group('id')

        feed = self._download_webpage(
            'http://player.ina.fr/notices/%s.mrss' % ident, ident)
        self.report_extraction(ident)

        media_url = self._html_search_regex(
            r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            feed, u'video URL')
        # The title sits inside a CDATA section of the feed.
        heading = self._search_regex(
            r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            feed, u'title')

        entry = {
            'id': ident,
            'url': media_url,
            'ext': 'mp4',
            'title': heading,
        }
        return [entry]
1621
class HowcastIE(InfoExtractor):
    """Extractor for videos hosted on Howcast.com."""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        """Return a one-item list with the info dictionary for `url`.

        The page is always fetched from the www.howcast.com address rebuilt
        from the video id found in `url`.
        """
        ident = re.match(self._VALID_URL, url).group('id')

        page = self._download_webpage(
            'http://www.howcast.com/videos/' + ident, ident)
        self.report_extraction(ident)

        media_url = self._search_regex(
            r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            page, u'video URL')

        # The <meta> values may be quoted with either double or single quotes.
        heading = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            page, u'title')
        summary = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            page, u'description', fatal=False)
        poster = self._html_search_regex(
            r'<meta content=\'(.+?)\' property=\'og:image\'',
            page, u'thumbnail', fatal=False)

        entry = {
            'id': ident,
            'url': media_url,
            'ext': 'mp4',
            'title': heading,
            'description': summary,
            'thumbnail': poster,
        }
        return [entry]
1655
class VineIE(InfoExtractor):
    """Extractor for videos hosted on Vine.co."""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        """Return a one-item list with the info dictionary for `url`.

        The page is always fetched from the https://vine.co address rebuilt
        from the video id found in `url`.
        """
        ident = re.match(self._VALID_URL, url).group('id')

        page = self._download_webpage('https://vine.co/v/' + ident, ident)
        self.report_extraction(ident)

        stream_url = self._html_search_regex(
            r'<meta property="twitter:player:stream" content="(.+?)"',
            page, u'video URL')
        heading = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            page, u'title')
        # The pattern separates an optional query string from the image URL.
        poster = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            page, u'thumbnail', fatal=False)
        author = self._html_search_regex(
            r'<div class="user">.*?<h2>(.+?)</h2>',
            page, u'uploader', fatal=False, flags=re.DOTALL)

        entry = {
            'id': ident,
            'url': stream_url,
            'ext': 'mp4',
            'title': heading,
            'thumbnail': poster,
            'uploader': author,
        }
        return [entry]
1689
class FlickrIE(InfoExtractor):
    """Extractor for videos hosted on Flickr."""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        """Return a one-item list with the info dictionary for `url`.

        Three documents are fetched in turn: the photo page (for the secret
        and the metadata), a first XML document (for the node id) and a
        playlist XML document (for the stream location).

        Raises ExtractorError when the playlist holds no STREAM entry.
        """
        url_match = re.match(self._VALID_URL, url)
        ident = url_match.group('id')
        owner = url_match.group('uploader_id')

        page_url = ''.join(('http://www.flickr.com/photos/', owner, '/', ident))
        page = self._download_webpage(page_url, ident)

        secret = self._search_regex(r"photo_secret: '(\w+)'", page, u'secret')

        # Step 1: the first document yields the node id of the video.
        lookup_url = ''.join((
            'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=',
            ident, '&secret=', secret, '&bitrate=700&target=_self'))
        lookup_xml = self._download_webpage(
            lookup_url, ident, 'Downloading first data webpage')
        node_id = self._html_search_regex(
            r'<Item id="id">(\d+-\d+)</Item>', lookup_xml, u'node_id')

        # Step 2: the playlist for that node carries the stream location.
        playlist_url = ''.join((
            'https://secure.flickr.com/video_playlist.gne?node_id=', node_id,
            '&tech=flash&mode=playlist&bitrate=700&secret=', secret,
            '&rd=video.yahoo.com&noad=1'))
        playlist_xml = self._download_webpage(
            playlist_url, ident, 'Downloading second data webpage')

        self.report_extraction(ident)

        stream = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', playlist_xml)
        if not stream:
            raise ExtractorError(u'Unable to extract video url')
        app, full_path = stream.group(1), stream.group(2)
        media_url = app + unescapeHTML(full_path)

        # Metadata comes from the Open Graph tags of the photo page.
        heading = self._html_search_regex(
            r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'video title')
        summary = self._html_search_regex(
            r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'description', fatal=False)
        poster = self._html_search_regex(
            r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'thumbnail', fatal=False)

        entry = {
            'id': ident,
            'url': media_url,
            'ext': 'mp4',
            'title': heading,
            'description': summary,
            'thumbnail': poster,
            'uploader_id': owner,
        }
        return [entry]
1738
class TeamcocoIE(InfoExtractor):
    """Extractor for videos hosted on teamcoco.com."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        """Return a one-item list with the info dictionary for `url`.

        The numeric video id is read from the page; the media URL comes from
        a separate XML document addressed by that id.

        Raises ExtractorError when `url` does not match _VALID_URL.
        """
        url_match = re.match(self._VALID_URL, url)
        if not url_match:
            raise ExtractorError(u'Invalid URL: %s' % url)

        slug = url_match.group('url_title')
        page = self._download_webpage(url, slug)

        ident = self._html_search_regex(
            r'<article class="video" data-id="(\d+?)"', page, u'video id')
        self.report_extraction(ident)

        # Metadata from the Open Graph tags of the page.
        heading = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"', page, u'title')
        poster = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)"',
            page, u'thumbnail', fatal=False)
        summary = self._html_search_regex(
            r'<meta property="og:description" content="(.*?)"',
            page, u'description', fatal=False)

        # The "high" <file> entry of the data document is used as media URL.
        manifest = self._download_webpage(
            'http://teamcoco.com/cvp/2.0/%s.xml' % ident,
            ident, 'Downloading data webpage')
        media_url = self._html_search_regex(
            r'<file type="high".*?>(.*?)</file>', manifest, u'video URL')

        entry = {
            'id': ident,
            'url': media_url,
            'ext': 'mp4',
            'title': heading,
            'thumbnail': poster,
            'description': summary,
        }
        return [entry]
1777
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    # The dot after "www" is escaped so that it only matches a literal dot.
    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self,url):
        """Extract the video information from an xHamster movie page.

        url -- movie page URL matching _VALID_URL.

        Returns a one-element list holding the info dictionary with the keys
        'id', 'url', 'ext', 'title', 'upload_date', 'uploader_id' and
        'thumbnail'.
        Raises ExtractorError if url does not match _VALID_URL or if the
        media location is not found in the page.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # The page is requested by id only; the title part of the URL is left out.
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if len(mobj.group('server')) == 0:
            # No server given: 'file' is itself a URL-quoted address.
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server')+'/key='+mobj.group('file')
        # The extension is whatever follows the last dot of the media URL.
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # Can't see the description anywhere in the UI
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        #     webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        # The upload date is assembled as YYYYMMDD from the timestamp hint.
        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
1829
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        """Extract the audio information from a Hype Machine track page.

        url -- track page URL matching _VALID_URL.

        Returns a one-element list holding the info dictionary with the keys
        'id', 'url', 'ext', 'title' and 'artist'.
        Raises ExtractorError if url does not match _VALID_URL, if one of the
        JSON documents cannot be parsed, or if the track list is empty.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        data = { 'ax': 1, 'ts': time.time() }
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        # The cookie set by this response is sent back with the metadata request.
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        try:
            track = track_list[u'tracks'][0]
        except (KeyError, IndexError):
            # Valid JSON, but without the expected track entry.
            raise ExtractorError(u'Hypemachine contained no track data.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        # An empty body turns this into a POST request.  It is given as bytes
        # because Python 3's urllib refuses text POST data.
        request = compat_urllib_request.Request(serve_url, b"", {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id':       track_id,
            'url':      final_url,
            'ext':      "mp3",
            'title':    title,
            'artist':   artist,
        }]
1879
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self,url):
        """Extract the video information from a Vbox7 play page.

        url -- play page URL matching _VALID_URL.

        Returns a one-element list holding the info dictionary with the keys
        'id', 'url', 'ext', 'title' and 'thumbnail'.
        Raises ExtractorError if url does not match _VALID_URL or if the info
        response does not have the expected two key=value fields.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # The first page assigns window.location; that redirect is followed
        # by hand to reach the page carrying the title.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        # Only the part of <title> before the first '/' is kept.
        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        ext = "flv"
        info_url = "http://vbox7.com/play/magare.do"
        # POST data is encoded to bytes because Python 3's urllib refuses text.
        # The urlencoded form is plain ASCII.
        data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id}).encode('ascii')
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        # The response is read as two '&'-separated key=value fields: the
        # media URL first, then the thumbnail URL.  Splitting on the first '='
        # only keeps values intact when they contain '=' themselves.
        try:
            (final_url, thumbnail_url) = [
                field.split('=', 1)[1] for field in info_response.split('&')]
        except (ValueError, IndexError):
            raise ExtractorError(u'Unable to extract the media url')

        return [{
            'id':        video_id,
            'url':       final_url,
            'ext':       ext,
            'title':     title,
            'thumbnail': thumbnail_url,
        }]
1915
1916
def gen_extractors():
    """Build and return one fresh instance of each supported extractor.

    The sequence is significant: the first extractor that matches a URL is
    the one that handles it, so the entries below must stay in this order.
    """
    extractor_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVIE,
        BlipTVUserIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        ZDFIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        XHamsterIE,
        HypemIE,
        Vbox7IE,
        GametrailersIE,
        StatigramIE,
        GenericIE,
    )
    return [extractor_class() for extractor_class in extractor_classes]
1986
def get_info_extractor(ie_name):
    """Look up an info extractor class by its name without the 'IE' suffix.

    The class is fetched from this module's global namespace under the key
    ie_name + 'IE'; a KeyError is raised when no such global exists.
    """
    class_name = ie_name + 'IE'
    module_namespace = globals()
    return module_namespace[class_name]