10 import xml.etree.ElementTree
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.dailymotion import DailymotionIE
24 from .extractor.gametrailers import GametrailersIE
25 from .extractor.generic import GenericIE
26 from .extractor.metacafe import MetacafeIE
27 from .extractor.statigram import StatigramIE
28 from .extractor.photobucket import PhotobucketIE
29 from .extractor.vimeo import VimeoIE
30 from .extractor.yahoo import YahooIE, YahooSearchIE
31 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
32 from .extractor.zdf import ZDFIE
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    # Accepts either a blip.tv user-page URL or the "bliptvuser:" shorthand;
    # group(1) captures the username.
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        # Match the URL to pull out the username.
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): this excerpt looks truncated -- the `if mobj is None:`
        # guard that should precede this raise is not visible here; verify
        # against the upstream file.
        raise ExtractorError(u'Invalid URL: %s' % url)
        username = mobj.group(1)

        # Mobile endpoint that returns the user's full episode list one page
        # at a time; %s is filled with the numeric users_id extracted below.
        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        # The numeric user id is embedded in the HTML as data-users-id="...".
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.
        # NOTE(review): the pagination loop header and the pagenum /
        # ids_in_page / video_ids initializations are elided in this excerpt.
        url = page_base + "&page=" + str(pagenum)
        page = self._download_webpage(url, username,
                                      u'Downloading video ids from page %d' % pagenum)

        # Extract video identifiers
        # Collect every href="/..." target, de-duplicating and HTML-unescaping.
        for mobj in re.finditer(r'href="/([^"]+)"', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(unescapeHTML(mobj.group(1)))
        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        # again.
        # NOTE(review): the `break` body of this condition is elided in this
        # excerpt.
        if len(ids_in_page) < self._PAGE_SIZE:

        # Wrap every collected id as a url_result delegated to the BlipTV
        # extractor, returned as one playlist named after the user.
        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    # (?#locale) is an inline regex comment: the preceding "../" matches a
    # two-letter locale path segment such as "en/".
    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        # NOTE(review): the `try:` opening this handler is elided in this
        # excerpt; verify against the upstream file.
        self.report_download_webpage(file_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse whitespace in the site's restriction banner and
                # surface it to the user as the failure reason.
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            # NOTE(review): an `else:` likely preceded this raise upstream.
            raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        # Extension without the leading dot.
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')

        # NOTE(review): the surrounding `return [{ ... }]` wrapper of this
        # info dict is elided in this excerpt.
        'id': file_id.decode('utf-8'),
        'url': file_url.decode('utf-8'),
        'ext': file_extension.decode('utf-8'),
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    # Named group ID captures the numeric video id from video.php / photo.php
    # style URLs.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    # Machine name used for .netrc credential lookup.
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        # Optional login step; credentials come from CLI options or .netrc.
        if self._downloader is None:
            # NOTE(review): the `return` body of this guard is elided in this
            # excerpt.
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # NOTE(review): the `try:` opener and the `if info is not None:`
            # branch unpacking (useremail, password) are elided here.
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # .netrc problems are non-fatal: warn and continue anonymous.
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        if useremail is None:
            # NOTE(review): the `return` body of this guard is elided, as is
            # the login_form construction that follows it.

        # Log in
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        # NOTE(review): the `try:` / self.report_login() lines are elided.
        login_results = compat_urllib_request.urlopen(request).read()
        # A login <form> still present in the response means the login failed.
        # NOTE(review): login_results is bytes under Python 3 while this
        # pattern is str -- confirm against upstream how this is handled.
        if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard is elided in this excerpt.
        raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        # Normalize to the canonical desktop video URL.
        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player parameters are embedded as a JSON array sandwiched
        # between these two JavaScript fragments.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        # NOTE(review): the `if not m:` guard is elided in this excerpt.
        raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        # 'params' is URL-quoted JSON describing the available streams.
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer HD, fall back to SD; the if/else scaffolding around these
        # assignments is elided in this excerpt.
        video_url = video_data.get('hd_src')
        video_url = video_data['sd_src']
        raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        # NOTE(review): the remaining arguments of this call are elided.
        video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',

        # NOTE(review): the surrounding info-dict wrapper is elided.
        'title': video_title,
        'duration': video_duration,
        'thumbnail': thumbnail,
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    # Matches regular page URLs, /play/ embed URLs, and api.swf fragments.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
    # Used to pull the file extension off a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard is elided in this excerpt.
        raise ExtractorError(u'Invalid URL: %s' % url)

        # See https://github.com/rg3/youtube-dl/issues/857
        # api.swf fragment URLs are rewritten to /play/ URLs first.
        api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
        if api_mobj is not None:
            url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            # /play/ URLs redirect; the real file id sits in the fragment of
            # the redirect target. Recurse with the canonical /a/a-<id> URL.
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)

        # NOTE(review): the `cchar` selection ('?' vs '&') is elided in this
        # excerpt. Request the JSON skin of the page, spoofing iTunes since
        # blip.tv serves different content to it.
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        # NOTE(review): the `try:` opener and the direct-download info dict
        # built under this branch are elided in this excerpt.
        urlh = compat_urllib_request.urlopen(request)
        if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
            basename = url.split('/')[-1]
            title,ext = os.path.splitext(basename)
            title = title.decode('UTF-8')
            ext = ext.replace('.', '')
            self.report_direct_download(title)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            # NOTE(review): the inner `try:` opener is elided here.
            json_code_bytes = urlh.read()
            json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))

            json_data = json.loads(json_code)
            # Some responses wrap the payload in a 'Post' key.
            if 'Post' in json_data:
                data = json_data['Post']
            # NOTE(review): the `else: data = json_data` branch is elided.

            # blip.tv dates look like "05-31-13 09:21AM".
            upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
            video_url = data['media']['url']
            umobj = re.match(self._URL_EXT, video_url)
            # NOTE(review): the `if umobj is None:` guard is elided here.
            raise ValueError('Can not determine filename extension')

            # NOTE(review): the `info = { ... }` wrapper around these entries
            # is elided in this excerpt.
            'id': data['item_id'],
            'uploader': data['display_name'],
            'upload_date': upload_date,
            'title': data['title'],
            'format': data['media']['mimeType'],
            'thumbnail': data['thumbnailUrl'],
            'description': data['description'],
            'player_url': data['embedUrl'],
            # Downloader must keep spoofing iTunes for the media request too.
            'user_agent': 'iTunes/10.6.1',
            except (ValueError,KeyError) as err:
                raise ExtractorError(u'Unable to parse video information: %s' % repr(err))
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'

    # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
    # Released into the Public Domain by Tristan Fischer on 2013-05-19
    # https://github.com/rg3/youtube-dl/pull/842
    def __rc4crypt(self,data, key):
        """RC4-decrypt `data` (bytes) with `key` (bytes); returns str."""
        # Key-scheduling algorithm (KSA).
        # NOTE(review): the `x = 0` initializer is elided in this excerpt.
        box = list(range(256))
        for i in list(range(256)):
            x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
            box[i], box[x] = box[x], box[i]
        # Pseudo-random generation (PRGA) stage; the loop header and the
        # x/y/out initializers are elided in this excerpt.
        y = (y + box[x]) % 256
        box[x], box[y] = box[y], box[x]
        out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
        # NOTE(review): this line belongs to the separate `__md5(self, s)`
        # helper whose `def` line is elided in this excerpt.
        return hashlib.md5(s).hexdigest().encode()

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard is elided here.
        raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Obfuscated, base64-wrapped key material used to derive the RC4 key;
        # the `GK = (` wrapper around these literals is elided in this excerpt.
        b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
        b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
        b'TnpsbA0KTVRkbU1tSTRNdz09'

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # Plain <source> tag means a non-encrypted flv is available directly.
        mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
        # NOTE(review): the `if mobj is not None:` branch opener is elided.
        self.report_extraction(video_id)
        video_url = mobj.group(1) + '.flv'

        video_title = self._html_search_regex('<title>([^<]+)</title>',
        # NOTE(review): the remaining arguments of the call above are elided.

        video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')

        # NOTE(review): the surrounding return/info-dict wrapper is elided.
        'title': video_title,

        # Otherwise: encrypted flash-player path. Pull the flashvars blob.
        mobj = re.search('var flashvars={(.+?)}', webpage)
        # NOTE(review): the `if mobj is None:` guard is elided here.
        raise ExtractorError(u'Unable to extract video')

        # Parse flashvars key/value pairs; params/sec initialization and the
        # surrounding loop context are elided in this excerpt.
        for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
            # _encxml holds the (quoted) metadata endpoint; everything else
            # is forwarded as a query parameter.
            if not a == '_encxml':
        encxml = compat_urllib_parse.unquote(b)
        if not params.get('domain'):
            params['domain'] = 'www.myvideo.de'
        xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
        if 'flash_playertype=MTV' in xmldata_url:
            # The MTV player variant is avoided by forcing player type D.
            self._downloader.report_warning(u'avoiding MTV player')
            # NOTE(review): the `xmldata_url = (` wrapper and the trailing
            # `) % video_id` of this URL template are elided.
            'http://www.myvideo.de/dynamic/get_player_video_xml.php'
            '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'

        # The response is "...=<hex>"; keep the hex payload only.
        enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
        enc_data_b = binascii.unhexlify(enc_data)
        # RC4 key = md5(md5-derived GK material + video id); the `sk = self.__md5(`
        # wrapper around these operands is elided in this excerpt.
        base64.b64decode(base64.b64decode(GK)) +
        str(video_id).encode('utf-8')
        dec_data = self.__rc4crypt(enc_data_b, sk)

        # extracting infos
        self.report_extraction(video_id)

        # RTMP case: connectionurl present in the decrypted metadata.
        mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
        # NOTE(review): the `if mobj:` branch opener is elided here.
        video_url = compat_urllib_parse.unquote(mobj.group(1))
        if 'myvideo2flash' in video_url:
            self._downloader.report_warning(u'forcing RTMPT ...')
            video_url = video_url.replace('rtmpe://', 'rtmpt://')

        # extract non rtmp videos
        mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
        # NOTE(review): the `if mobj is None:` guard is elided here.
        raise ExtractorError(u'unable to extract url')
        video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))

        video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
        video_file = compat_urllib_parse.unquote(video_file)

        if not video_file.endswith('f4m'):
            # Plain file: build an rtmp play path "<ext>:<path>".
            ppath, prefix = video_file.split('.')
            video_playpath = '%s:%s' % (prefix, ppath)
        video_hls_playlist = ''
        # f4m manifest: derive the equivalent HLS playlist URL. The
        # `else:` opener and video_filepath extraction are elided here.
        video_hls_playlist = (
            video_filepath + video_file
        ).replace('.f4m', '.m3u8')

        video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
        video_swfobj = compat_urllib_parse.unquote(video_swfobj)

        video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
        # NOTE(review): the remaining arguments of the call above, and the
        # `return [{` wrapper of the entries below, are elided.
        'title': video_title,
        'play_path': video_playpath,
        'video_file': video_file,
        'video_hls_playlist': video_hls_playlist,
        'player_url': video_swfobj,
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    # NOTE(review): the closing `"""` of this verbose pattern is elided in
    # this excerpt; named groups: shortname, showname, episode, clipID,
    # cntitle, date, tdstitle.
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                      (?P<showname>thedailyshow|colbertnation)\.com/
                      (full-episodes/(?P<episode>.*)|
                       (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                       |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))

    # Bitrates, best last.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # NOTE(review): the contents and closing braces of these two mapping
    # literals are elided in this excerpt.
    _video_extensions = {
    _video_dimensions = {

    # NOTE(review): an `@classmethod` decorator likely precedes this def
    # upstream (it takes `cls`); verify.
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose pattern, so re.VERBOSE is required here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _print_formats(self, formats):
        print('Available formats:')
        # NOTE(review): the `for x in formats:` loop header is elided here.
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # NOTE(review): the `if mobj is None:` guard is elided here.
        raise ExtractorError(u'Invalid URL: %s' % url)

        # Shorthand forms (":tds", ":colbert", ...) expand to the show's
        # full-episodes landing page, then get re-matched.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            # NOTE(review): the `else:` opener is elided here.
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            # Clip URL: the episode title group depends on the show.
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            # NOTE(review): the `else:` opener is elided here.
                epTitle = mobj.group('cntitle')
        # NOTE(review): the `else:` / dlNewest branch scaffolding around the
        # next three lines is elided in this excerpt.
            dlNewest = not mobj.group('episode')
                epTitle = mobj.group('showname')
                epTitle = mobj.group('episode')

        self.report_extraction(epTitle)
        webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
        # When asked for the newest episode, follow the redirect and re-match
        # to discover which concrete episode we landed on.
        # NOTE(review): the `if dlNewest:` opener is elided here.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            # NOTE(review): the `if mobj is None:` guard is elided here.
                raise ExtractorError(u'Invalid redirected URL: ' + url)
            if mobj.group('episode') == '':
                raise ExtractorError(u'Redirected URL is still not specific: ' + url)
            epTitle = mobj.group('episode')

        # mgid-style media URIs embedded in the player markup.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.
            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
            # NOTE(review): the `else:` opener is elided here.
            mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        # The mrss feed lists one <item> per episode part.
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        indexXml = self._download_webpage(indexUrl, epTitle,
                                          u'Downloading show index',
                                          u'unable to download episode index')

        # NOTE(review): the `results = []` initializer is elided here.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            # guid looks like "...:<showId>.com:<shortMediaId>".
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            # Per-part config XML lists one <rendition> per bitrate.
            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                         compat_urllib_parse.urlencode({'uri': mediaId}))
            configXml = self._download_webpage(configUrl, epTitle,
                                               u'Downloading configuration for %s' % shortMediaId)

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            # NOTE(review): the `turls = []` initializer and the
            # `turls.append(finfo)` call are elided in this excerpt.
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)

            # NOTE(review): the `if len(turls) == 0:` guard is elided here.
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                # NOTE(review): the `return` after listing is elided here.

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            # NOTE(review): the `for f,v in turls:` / `if f == req_format:`
            # scaffolding around this line is elided in this excerpt.
                    format, rtmp_video_url = f, v

            # Rewrite the rtmp url to the equivalent progressive-http mirror.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            # NOTE(review): the `if not m:` guard is elided here.
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            # NOTE(review): the `info = { ... }` wrapper of these entries is
            # elided in this excerpt.
            'upload_date': officialDate,
            'description': officialTitle,
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    # Named groups: showname (uploader slug), episode (video slug/id).
    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(videoId)
        webpage = self._download_webpage(url, videoId)

        # Description and thumbnail are optional (fatal=False).
        videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
            webpage, u'description', fatal=False)

        imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
            webpage, u'thumbnail', fatal=False)

        # og:video points at the Flash player; its config= query parameter
        # carries the (URL-quoted) configuration URL.
        playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
            webpage, u'player url')

        # BUG FIX: this regex previously reported its field as u'player url'
        # (copy-paste from the search above); it extracts the title. The page
        # title looks like "Show : Episode", keep only the episode part.
        title = self._html_search_regex('<meta name="title" content="([^"]*)"',
            webpage, u'title').split(' : ')[-1]

        configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
        configUrl = compat_urllib_parse.unquote(configUrl)

        configJSON = self._download_webpage(configUrl, videoId,
            u'Downloading configuration',
            u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))

        # playlist[1] is the actual video entry (playlist[0] is an ad/intro).
        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': title,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': videoDesc,
            'player_url': playerUrl,
        }

        return [info]
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard is elided in this excerpt.
        raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # NOTE(review): the `info = {...}` initializer is elided here.
        self.report_extraction(video_id)
        # Per-video metadata XML endpoint.
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        # NOTE(review): the `try:` opener is elided here.
        metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        # NOTE(review): the `try:` opener around this metadata lookup is
        # elided here (IndexError handled below via the bare raise).
        videoNode = mdoc.findall('./video')[0]
        info['description'] = videoNode.findall('./description')[0].text
        info['title'] = videoNode.findall('./caption')[0].text
        info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
        manifest_url = videoNode.findall('./file')[0].text
        # NOTE(review): the `except IndexError:` opener is elided here.
        raise ExtractorError(u'Invalid metadata XML file')

        # hdcore parameter is required for the Adobe HTTP Dynamic Streaming
        # manifest to be served.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        # NOTE(review): the `try:` opener is elided here.
        manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        # NOTE(review): the `try:` opener is elided here.
        media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
        node_id = media_node.attrib['url']
        video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            raise ExtractorError(u'Invalid manifest file')

        # Reassemble the direct fragment URL from the manifest location.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    # group(1) captures the numeric video id.
    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard is elided in this excerpt.
        raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Extract video URL (flv_url flashvar, URL-quoted).
        video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
            webpage, u'video URL'))

        # Extract title; the remaining arguments of this call are elided in
        # this excerpt.
        video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',

        # Extract video thumbnail
        video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
            webpage, u'thumbnail', fatal=False)

        # NOTE(review): the surrounding info-dict wrapper is elided.
        'title': video_title,
        'thumbnail': video_thumbnail,
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
    """

    # group(1) = uploader slug, group(2) = track slug.
    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard is elided in this excerpt.
        raise ExtractorError(u'Invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # The resolve endpoint maps a page URL to the track's API record.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        # Streams endpoint returns one URL per transcoding.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        stream_json = self._download_webpage(streams_url, full_title,
                                             u'Downloading stream definitions',
                                             u'unable to download stream definitions')

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']
        upload_date = unified_strdate(info['created_at'])

        # NOTE(review): the `return [{` wrapper and the id/url/ext entries of
        # this info dict are elided in this excerpt.
        'uploader': info['user']['username'],
        'upload_date': upload_date,
        'title': info['title'],
        'description': info['description'],
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
    """

    # group(1) = uploader slug, group(2) = set slug.
    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard is elided in this excerpt.
        raise ExtractorError(u'Invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the set URL to its API record (contains a 'tracks' list).
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title)

        # NOTE(review): the `videos = []` initializer and the
        # `if 'errors' in info:` guard around this loop are elided.
        info = json.loads(info_json)
        for err in info['errors']:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))

        self.report_extraction(full_title)
        for track in info['tracks']:
            video_id = track['id']

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')

            self.report_extraction(video_id)
            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            # NOTE(review): the `videos.append({` wrapper and the id/url/ext
            # entries of this per-track dict are elided, as is the final
            # `return videos`.
            'uploader': track['user']['username'],
            'upload_date': unified_strdate(track['created_at']),
            'title': track['title'],
            'description': track['description'],
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard is elided in this excerpt.
        raise ExtractorError(u'Invalid URL: %s' % url)

        # InfoQ pages have no obvious numeric id, so the URL doubles as the id
        # for progress reporting; the real id is derived from the file name
        # below.
        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: jsclassref holds the base64-encoded rtmp path.
        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        # NOTE(review): the `if mobj is None:` guard is elided here.
        raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title; the remaining arguments of this call are elided in
        # this excerpt.
        video_title = self._search_regex(r'contentTitle = "(.*?)";',

        # Extract description
        video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
            webpage, u'description', fatal=False)

        # Derive id and extension from the media file name.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        # NOTE(review): the surrounding info-dict wrapper is elided.
        'title': video_title,
        'ext': extension, # Extension is always(?) mp4, but seems to be flv
        'description': video_description,
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        # NOTE(review): the `try:` opener is elided in this excerpt.
        bitrate_list = jsonData[fmt]
        if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
            bitrate = max(bitrate_list) # select highest

        url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        # NOTE(review): the `return url_list` is elided here.

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            # NOTE(review): the `try:` opener and the `return url` success
            # path are elided in this excerpt; dead URLs fall through to the
            # next candidate.
            compat_urllib_request.urlopen(url)
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        # NOTE(review): the final `return None` is elided here.

    def _print_formats(self, formats):
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                # NOTE(review): the `try:` opener is elided here.
                ext = formats[fmt][b][0]
                print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard is elided in this excerpt.
        raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        # NOTE(review): the `try:` opener is elided here.
        self.report_download_json(file_url)
        jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            # NOTE(review): the `return` after listing is elided here.

        if req_format is None or req_format == 'best':
            # Probe each advertised format until a live URL is found.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    # NOTE(review): the `break` body and the `else:` branch
                    # opener for the explicit-format path are elided here.
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')
            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        # NOTE(review): the `return [{` wrapper of these entries is elided.
        'id': file_id.decode('utf-8'),
        'url': file_url.decode('utf-8'),
        'uploader': uploader.decode('utf-8'),
        'upload_date': None,
        'title': json_data['name'],
        'ext': file_url.split('.')[-1].decode('utf-8'),
        'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
        'thumbnail': json_data['thumbnail_url'],
        'description': json_data['description'],
        'player_url': player_url.decode('utf-8'),
# Extractor for Stanford Open ClassRoom (openclassroom.stanford.edu).
# Dispatches on the matched URL: specific video (course+video groups),
# a course page (course only), or the site root (neither).
# NOTE(review): this excerpt appears subsampled — embedded source line numbers
# skip values, so guards ("if mobj is None:"), "try:" openers and "return"
# statements are not visible here; verify against the complete file.
1103 class StanfordOpenClassroomIE(InfoExtractor):
1104 """Information extractor for Stanford's Open ClassRoom"""
1106 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
1107 IE_NAME = u'stanfordoc'
1109 def _real_extract(self, url):
1110 mobj = re.match(self._VALID_URL, url)
1112 raise ExtractorError(u'Invalid URL: %s' % url)
1114 if mobj.group('course') and mobj.group('video'): # A specific video
1115 course = mobj.group('course')
1116 video = mobj.group('video')
1118 'id': course + '_' + video,
1120 'upload_date': None,
1123 self.report_extraction(info['id'])
# Per-video metadata lives in an XML document next to the video files.
1124 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
1125 xmlUrl = baseUrl + video + '.xml'
1127 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
1128 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1129 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
1130 mdoc = xml.etree.ElementTree.fromstring(metaXml)
1132 info['title'] = mdoc.findall('./title')[0].text
1133 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
1135 raise ExtractorError(u'Invalid metadata XML file')
1136 info['ext'] = info['url'].rpartition('.')[2]
1138 elif mobj.group('course'): # A course page
1139 course = mobj.group('course')
1144 'upload_date': None,
1147 coursepage = self._download_webpage(url, info['id'],
1148 note='Downloading course info page',
1149 errnote='Unable to download course info page')
1151 info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
1153 info['description'] = self._html_search_regex('<description>([^<]+)</description>',
1154 coursepage, u'description', fatal=False)
# Collect VideoPage links and recurse into each via self.extract().
1156 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
1159 'type': 'reference',
1160 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
1164 for entry in info['list']:
1165 assert entry['type'] == 'reference'
1166 results += self.extract(entry['url'])
# Root page: enumerate all course pages and recurse into each.
1170 'id': 'Stanford OpenClassroom',
1173 'upload_date': None,
1176 self.report_download_webpage(info['id'])
1177 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
1179 rootpage = compat_urllib_request.urlopen(rootURL).read()
1180 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1181 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
1183 info['title'] = info['id']
1185 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
1188 'type': 'reference',
1189 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
1194 for entry in info['list']:
1195 assert entry['type'] == 'reference'
1196 results += self.extract(entry['url'])
# Extractor for MTV.com video pages: scrapes <meta> tags for the song/artist
# and mtvn_uri, then fetches a mediaGen XML and picks the last <rendition>.
# NOTE(review): excerpt appears subsampled (embedded line numbers skip), so
# some statements ("if mobj is None:", "try:", the extraction of "performer"
# used at line 1251, and the final return) are not visible; verify in full file.
1199 class MTVIE(InfoExtractor):
1200 """Information extractor for MTV.com"""
1202 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
1205 def _real_extract(self, url):
1206 mobj = re.match(self._VALID_URL, url)
1208 raise ExtractorError(u'Invalid URL: %s' % url)
# Scheme is optional in _VALID_URL; normalize to http:// before download.
1209 if not mobj.group('proto'):
1210 url = 'http://' + url
1211 video_id = mobj.group('videoid')
1213 webpage = self._download_webpage(url, video_id)
1215 song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
1216 webpage, u'song name', fatal=False)
1218 video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
1221 mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
1222 webpage, u'mtvn_uri', fatal=False)
1224 content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
1225 webpage, u'content id', fatal=False)
1227 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
1228 self.report_extraction(video_id)
1229 request = compat_urllib_request.Request(videogen_url)
1231 metadataXml = compat_urllib_request.urlopen(request).read()
1232 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1233 raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
1235 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
1236 renditions = mdoc.findall('.//rendition')
1238 # For now, always pick the highest quality.
1239 rendition = renditions[-1]
# Format id is built as "<ext>-<width>x<height>_<bitrate>".
1242 _,_,ext = rendition.attrib['type'].partition('/')
1243 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
1244 video_url = rendition.find('./src').text
1246 raise ExtractorError('Invalid rendition field.')
1251 'uploader': performer,
1252 'upload_date': None,
1253 'title': video_title,
# Extractor for v.youku.com. Downloads a JSON playlist config, decodes the
# obfuscated file id with a seeded shuffle of a character table, then builds
# one download URL per segment.
# NOTE(review): excerpt appears subsampled (embedded line numbers skip):
# the "def _gen_sid(self):" header itself, several format branches, and the
# final return are not visible here; verify against the complete file.
1261 class YoukuIE(InfoExtractor):
1262 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
# Session id: millisecond timestamp followed by two random numbers.
1265 nowTime = int(time.time() * 1000)
1266 random1 = random.randint(1000,1998)
1267 random2 = random.randint(1000,9999)
1269 return "%d%d%d" %(nowTime,random1,random2)
1271 def _get_file_ID_mix_string(self, seed):
# Deterministic seeded shuffle (LCG: seed = seed*211 + 30031 mod 65536)
# of the character table; same seed always yields the same ordering.
1273 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
1275 for i in range(len(source)):
1276 seed = (seed * 211 + 30031 ) % 65536
1277 index = math.floor(seed / 65536 * len(source) )
1278 mixed.append(source[int(index)])
1279 source.remove(source[int(index)])
1280 #return ''.join(mixed)
1283 def _get_file_id(self, fileId, seed):
# The obfuscated id is '*'-separated indices into the shuffled table.
1284 mixed = self._get_file_ID_mix_string(seed)
1285 ids = fileId.split('*')
1289 realId.append(mixed[int(ch)])
1290 return ''.join(realId)
1292 def _real_extract(self, url):
1293 mobj = re.match(self._VALID_URL, url)
1295 raise ExtractorError(u'Invalid URL: %s' % url)
1296 video_id = mobj.group('ID')
1298 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
1300 jsondata = self._download_webpage(info_url, video_id)
1302 self.report_extraction(video_id)
1304 config = json.loads(jsondata)
1306 video_title = config['data'][0]['title']
1307 seed = config['data'][0]['seed']
1309 format = self._downloader.params.get('format', None)
1310 supported_format = list(config['data'][0]['streamfileids'].keys())
1312 if format is None or format == 'best':
1313 if 'hd2' in supported_format:
1318 elif format == 'worst':
1326 fileid = config['data'][0]['streamfileids'][format]
1327 keys = [s['k'] for s in config['data'][0]['segs'][format]]
1328 except (UnicodeDecodeError, ValueError, KeyError):
1329 raise ExtractorError(u'Unable to extract info section')
1332 sid = self._gen_sid()
1333 fileid = self._get_file_id(fileid, seed)
1335 #column 8,9 of fileid represent the segment number
1336 #fileid[7:9] should be changed
1337 for index, key in enumerate(keys):
# Splice the 2-hex-digit segment index into positions 8-9 of the file id.
1339 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
1340 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
1343 'id': '%s_part%02d' % (video_id, index),
1344 'url': download_url,
1346 'upload_date': None,
1347 'title': video_title,
1350 files_info.append(info)
# Extractor for video.xnxx.com: pulls the flv URL, title and thumbnail out of
# the page with three class-level regexes.
# NOTE(review): excerpt appears subsampled (embedded line numbers skip), so
# the "if mobj is None:" guard and the final return are not visible here.
1355 class XNXXIE(InfoExtractor):
1356 """Information extractor for xnxx.com"""
1358 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
1360 VIDEO_URL_RE = r'flv_url=(.*?)&'
1361 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
1362 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
1364 def _real_extract(self, url):
1365 mobj = re.match(self._VALID_URL, url)
1367 raise ExtractorError(u'Invalid URL: %s' % url)
1368 video_id = mobj.group(1)
1370 # Get webpage content
1371 webpage = self._download_webpage(url, video_id)
# The flv URL is percent-encoded inside the page; unquote before use.
1373 video_url = self._search_regex(self.VIDEO_URL_RE,
1374 webpage, u'video URL')
1375 video_url = compat_urllib_parse.unquote(video_url)
1377 video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
1380 video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
1381 webpage, u'thumbnail', fatal=False)
1387 'upload_date': None,
1388 'title': video_title,
1390 'thumbnail': video_thumbnail,
1391 'description': None,
# Extractor for plus.google.com posts: scrapes the post page for metadata,
# then a second "video page" for the actual googlevideo redirector links.
# NOTE(review): excerpt appears subsampled (embedded line numbers skip), so
# guards, a "try:" opener and the final return are not visible here.
1395 class GooglePlusIE(InfoExtractor):
1396 """Information extractor for plus.google.com."""
1398 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
1399 IE_NAME = u'plus.google'
1401 def _real_extract(self, url):
1402 # Extract id from URL
1403 mobj = re.match(self._VALID_URL, url)
1405 raise ExtractorError(u'Invalid URL: %s' % url)
1407 post_url = mobj.group(0)
1408 video_id = mobj.group(1)
1410 video_extension = 'flv'
1412 # Step 1, Retrieve post webpage to extract further information
1413 webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
1415 self.report_extraction(video_id)
1417 # Extract update date
1418 upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
1419 webpage, u'upload date', fatal=False)
1421 # Convert timestring to a format suitable for filename
# NOTE(review): strptime here assumes the scraped date is "%Y-%m-%d";
# if upload_date is None (fatal=False above) this would raise — the
# guarding line is presumably among the lines missing from this excerpt.
1422 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
1423 upload_date = upload_date.strftime('%Y%m%d')
1426 uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
1427 webpage, u'uploader', fatal=False)
1430 # Get the first line for title
1431 video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
1432 webpage, 'title', default=u'NA')
1434 # Step 2, Stimulate clicking the image box to launch video
1435 video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
1436 webpage, u'video page URL')
1437 webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
1439 # Extract video links on video page
1440 """Extract video links of all sizes"""
1441 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
1442 mobj = re.findall(pattern, webpage)
1444 raise ExtractorError(u'Unable to extract video links')
1446 # Sort in resolution
1447 links = sorted(mobj)
1449 # Choose the lowest of the sort, i.e. highest resolution
1450 video_url = links[-1]
1451 # Only get the url. The resolution part in the tuple has no use anymore
1452 video_url = video_url[-1]
1453 # Treat escaped \u0026 style hex
# Python 2 str has .decode; on Python 3 the AttributeError branch re-decodes
# via a bytes round-trip to apply unicode-escape.
1455 video_url = video_url.decode("unicode_escape")
1456 except AttributeError: # Python 3
1457 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
1463 'uploader': uploader,
1464 'upload_date': upload_date,
1465 'title': video_title,
1466 'ext': video_extension,
# Extractor for nba.com videos: the media URL is constructed directly from the
# URL path (no page parsing needed for the stream itself).
# NOTE(review): excerpt appears subsampled (embedded line numbers skip); the
# "if mobj is None:" guard and the final return are not visible here.
1469 class NBAIE(InfoExtractor):
1470 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
1473 def _real_extract(self, url):
1474 mobj = re.match(self._VALID_URL, url)
1476 raise ExtractorError(u'Invalid URL: %s' % url)
1478 video_id = mobj.group(1)
1480 webpage = self._download_webpage(url, video_id)
# video_id starts with "/" (captured including the slash), so this forms
# .../nba/big/<path>_nba_1280x720.mp4 on the CDN.
1482 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
1484 shortened_video_id = video_id.rpartition('/')[2]
1485 title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
1486 webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
1488 # It isn't there in the HTML it returns to us
1489 # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
1491 description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
1494 'id': shortened_video_id,
1498 # 'uploader_date': uploader_date,
1499 'description': description,
# Extractor for justin.tv / twitch.tv: three URL shapes — a channel archive
# (paged JSON API), a chapter (XML + kraken JSON APIs), or a single broadcast.
# NOTE(review): excerpt appears subsampled (embedded line numbers skip), so
# guards, loop headers ("for offset in ..."), "else:" lines and returns are
# not visible here; verify against the complete file.
1503 class JustinTVIE(InfoExtractor):
1504 """Information extractor for justin.tv and twitch.tv"""
1505 # TODO: One broadcast may be split into multiple videos. The key
1506 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
1507 # starts at 1 and increases. Can we treat all parts as one video?
1509 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
1511 (?P<channelid>[^/]+)|
1512 (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
1513 (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
1517 _JUSTIN_PAGE_LIMIT = 100
1518 IE_NAME = u'justin.tv'
1520 def report_download_page(self, channel, offset):
1521 """Report attempt to download a single page of videos."""
1522 self.to_screen(u'%s: Downloading video information from %d to %d' %
1523 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
1525 # Return count of items, list of *valid* items
1526 def _parse_page(self, url, video_id):
1527 webpage = self._download_webpage(url, video_id,
1528 u'Downloading video info JSON',
1529 u'unable to download video info JSON')
# The API returns a JSON list on success; anything else carries an error.
1531 response = json.loads(webpage)
1532 if type(response) != list:
1533 error_text = response.get('error', 'unknown error')
1534 raise ExtractorError(u'Justin.tv API: %s' % error_text)
1536 for clip in response:
1537 video_url = clip['video_file_url']
1539 video_extension = os.path.splitext(video_url)[1][1:]
# start_time is ISO-like; strip dashes from the date part -> YYYYMMDD.
1540 video_date = re.sub('-', '', clip['start_time'][:10])
1541 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
1542 video_id = clip['id']
1543 video_title = clip.get('title', video_id)
1547 'title': video_title,
1548 'uploader': clip.get('channel_name', video_uploader_id),
1549 'uploader_id': video_uploader_id,
1550 'upload_date': video_date,
1551 'ext': video_extension,
1553 return (len(response), info)
1555 def _real_extract(self, url):
1556 mobj = re.match(self._VALID_URL, url)
1558 raise ExtractorError(u'invalid URL: %s' % url)
1560 api_base = 'http://api.justin.tv'
1562 if mobj.group('channelid'):
1564 video_id = mobj.group('channelid')
1565 api = api_base + '/channel/archives/%s.json' % video_id
1566 elif mobj.group('chapterid'):
1567 chapter_id = mobj.group('chapterid')
# Resolve the chapter to its parent archive id from the chapter page.
1569 webpage = self._download_webpage(url, chapter_id)
1570 m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
1572 raise ExtractorError(u'Cannot find archive of a chapter')
1573 archive_id = m.group(1)
1575 api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
1576 chapter_info_xml = self._download_webpage(api, chapter_id,
1577 note=u'Downloading chapter information',
1578 errnote=u'Chapter information download failed')
1579 doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
1580 for a in doc.findall('.//archive'):
1581 if archive_id == a.find('./id').text:
1584 raise ExtractorError(u'Could not find chapter in chapter information')
1586 video_url = a.find('./video_file_url').text
1587 video_ext = video_url.rpartition('.')[2] or u'flv'
# Title/thumbnail/uploader come from the newer twitch kraken API.
1589 chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
1590 chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
1591 note='Downloading chapter metadata',
1592 errnote='Download of chapter metadata failed')
1593 chapter_info = json.loads(chapter_info_json)
1595 bracket_start = int(doc.find('.//bracket_start').text)
1596 bracket_end = int(doc.find('.//bracket_end').text)
1598 # TODO determine start (and probably fix up file)
1599 # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
1600 #video_url += u'?start=' + TODO:start_timestamp
1601 # bracket_start is 13290, but we want 51670615
1602 self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
1603 u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
1606 'id': u'c' + chapter_id,
1609 'title': chapter_info['title'],
1610 'thumbnail': chapter_info['preview'],
1611 'description': chapter_info['description'],
1612 'uploader': chapter_info['channel']['display_name'],
1613 'uploader_id': chapter_info['channel']['name'],
1617 video_id = mobj.group('videoid')
1618 api = api_base + '/broadcast/by_archive/%s.json' % video_id
1620 self.report_extraction(video_id)
# Page through the API; a short page (< limit items) means we're done.
1624 limit = self._JUSTIN_PAGE_LIMIT
1627 self.report_download_page(video_id, offset)
1628 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
1629 page_count, page_info = self._parse_page(page_url, video_id)
1630 info.extend(page_info)
1631 if not paged or page_count != limit:
# Extractor for funnyordie.com: video URL, title and description are scraped
# straight from the page markup.
# NOTE(review): excerpt appears subsampled (embedded line numbers skip); the
# "if mobj is None:" guard and the final return are not visible here.
1636 class FunnyOrDieIE(InfoExtractor):
1637 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
1639 def _real_extract(self, url):
1640 mobj = re.match(self._VALID_URL, url)
1642 raise ExtractorError(u'invalid URL: %s' % url)
1644 video_id = mobj.group('id')
1645 webpage = self._download_webpage(url, video_id)
1647 video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
1648 webpage, u'video URL', flags=re.DOTALL)
# Two title patterns are tried in order: player header first, then <title>.
1650 title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
1651 r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
1653 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
1654 webpage, u'description', fatal=False, flags=re.DOTALL)
1661 'description': video_description,
# Extractor for store.steampowered.com game/video pages. Handles the age gate
# by re-requesting through the agecheck URL, then zips together the movie
# URLs, titles and thumbnails found in the page into a playlist.
# NOTE(review): excerpt appears subsampled (embedded line numbers skip); some
# intermediate lines (e.g. the videos list initialisation) are not visible.
1665 class SteamIE(InfoExtractor):
1666 _VALID_URL = r"""http://store\.steampowered\.com/
1668 (?P<urltype>video|app)/ #If the page is only for videos or for a game
1670 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
1672 _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
1673 _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'
# suitable() is overridden because _VALID_URL is written with (?x)-style
# whitespace and must be matched with re.VERBOSE.
1676 def suitable(cls, url):
1677 """Receives a URL and returns True if suitable for this IE."""
1678 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1680 def _real_extract(self, url):
1681 m = re.match(self._VALID_URL, url, re.VERBOSE)
1682 gameID = m.group('gameID')
1684 videourl = self._VIDEO_PAGE_TEMPLATE % gameID
1685 webpage = self._download_webpage(videourl, gameID)
1687 if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
1688 videourl = self._AGECHECK_TEMPLATE % gameID
1689 self.report_age_confirmation()
1690 webpage = self._download_webpage(videourl, gameID)
1692 self.report_extraction(gameID)
1693 game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
1694 webpage, 'game title')
1696 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
1697 mweb = re.finditer(urlRE, webpage)
1698 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
1699 titles = re.finditer(namesRE, webpage)
1700 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
1701 thumbs = re.finditer(thumbsRE, webpage)
# The three iterators are assumed to line up positionally per video.
1703 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
1704 video_id = vid.group('videoID')
1705 title = vtitle.group('videoName')
1706 video_url = vid.group('videoURL')
1707 video_thumb = thumb.group('thumbnail')
1709 raise ExtractorError(u'Cannot find video url for %s' % video_id)
1714 'title': unescapeHTML(title),
1715 'thumbnail': video_thumb
1718 return [self.playlist_result(videos, gameID, game_title)]
# Extractor for ustream.tv recorded videos: the media URL is derived directly
# from the video id; title/uploader/thumbnail are scraped from the page.
# NOTE(review): excerpt appears subsampled (embedded line numbers skip); the
# final return is not visible here.
1720 class UstreamIE(InfoExtractor):
1721 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
1722 IE_NAME = u'ustream'
1724 def _real_extract(self, url):
1725 m = re.match(self._VALID_URL, url)
1726 video_id = m.group('videoID')
1728 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
1729 webpage = self._download_webpage(url, video_id)
1731 self.report_extraction(video_id)
1733 video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
1736 uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
1737 webpage, u'uploader', fatal=False, flags=re.DOTALL)
1739 thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
1740 webpage, u'thumbnail', fatal=False)
1746 'title': video_title,
1747 'uploader': uploader,
1748 'thumbnail': thumbnail,
# Extractor for worldstarhiphop.com / worldstarcandy.com: pulls the file URL
# out of a flash-player addVariable call, then fixes up the title for
# "candy" pages which use a different title element.
# NOTE(review): excerpt appears subsampled (embedded line numbers skip);
# the ext selection branch after the 'mp4' check and the final return are
# not visible here.
1752 class WorldStarHipHopIE(InfoExtractor):
1753 _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
1754 IE_NAME = u'WorldStarHipHop'
1756 def _real_extract(self, url):
1757 m = re.match(self._VALID_URL, url)
1758 video_id = m.group('id')
1760 webpage_src = self._download_webpage(url, video_id)
1762 video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
1763 webpage_src, u'video URL')
1765 if 'mp4' in video_url:
1770 video_title = self._html_search_regex(r"<title>(.*)</title>",
1771 webpage_src, u'title')
1773 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
1774 thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
1775 webpage_src, u'thumbnail', fatal=False)
# Candy pages keep the real title in a candytitles span instead of <title>.
1778 _title = r"""candytitles.*>(.*)</span>"""
1779 mobj = re.search(_title, webpage_src)
1780 if mobj is not None:
1781 video_title = mobj.group(1)
1786 'title' : video_title,
1787 'thumbnail' : thumbnail,
# Extractor for rbmaradio.com shows: metadata is embedded in the page as a
# JSON assignment to window.gon.show; the stream URL is the akamai_url field.
# NOTE(review): excerpt appears subsampled (embedded line numbers skip); the
# "try:" opener before json.loads and the final return are not visible here.
1792 class RBMARadioIE(InfoExtractor):
1793 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
1795 def _real_extract(self, url):
1796 m = re.match(self._VALID_URL, url)
1797 video_id = m.group('videoID')
1799 webpage = self._download_webpage(url, video_id)
1801 json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
1802 webpage, u'json data', flags=re.MULTILINE)
1805 data = json.loads(json_data)
1806 except ValueError as e:
1807 raise ExtractorError(u'Invalid JSON: ' + str(e))
# Force the 256kbps variant; extension is taken from the URL path.
1809 video_url = data['akamai_url'] + '&cbr=256'
1810 url_parts = compat_urllib_parse_urlparse(video_url)
1811 video_ext = url_parts.path.rpartition('.')[2]
1816 'title': data['title'],
1817 'description': data.get('teaser_text'),
1818 'location': data.get('country_of_origin'),
1819 'uploader': data.get('host', {}).get('name'),
1820 'uploader_id': data.get('host', {}).get('slug'),
1821 'thumbnail': data.get('image', {}).get('large_url_2x'),
1822 'duration': data.get('duration'),
# Extractor for youporn.com: parses a JSON blob from the page for metadata,
# scrapes the download list for per-resolution links, and applies the user's
# requested format (best/worst/all/specific).
# NOTE(review): excerpt appears subsampled (embedded line numbers skip), so
# several lines (loop headers over x/links, "try:" openers, size/bitrate
# parsing, returns) are not visible here; verify against the complete file.
1827 class YouPornIE(InfoExtractor):
1828 """Information extractor for youporn.com."""
1829 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
1831 def _print_formats(self, formats):
1832 """Print all available formats"""
1833 print(u'Available formats:')
1834 print(u'ext\t\tformat')
1835 print(u'---------------------------------')
1836 for format in formats:
1837 print(u'%s\t\t%s' % (format['ext'], format['format']))
# Select the single entry matching req_format from the formats list.
1839 def _specific(self, req_format, formats):
1841 if(x["format"]==req_format):
1845 def _real_extract(self, url):
1846 mobj = re.match(self._VALID_URL, url)
1848 raise ExtractorError(u'Invalid URL: %s' % url)
1849 video_id = mobj.group('videoid')
# The site gates content behind an age check; the cookie bypasses it.
1851 req = compat_urllib_request.Request(url)
1852 req.add_header('Cookie', 'age_verified=1')
1853 webpage = self._download_webpage(req, video_id)
1855 # Get JSON parameters
1856 json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
1858 params = json.loads(json_params)
1860 raise ExtractorError(u'Invalid JSON')
1862 self.report_extraction(video_id)
1864 video_title = params['title']
1865 upload_date = unified_strdate(params['release_date_f'])
1866 video_description = params['description']
1867 video_uploader = params['submitted_by']
1868 thumbnail = params['thumbnails'][0]['image']
1870 raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])
1872 # Get all of the formats available
1873 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
1874 download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
1875 webpage, u'download list').strip()
1877 # Get all of the links from the page
1878 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
1879 links = re.findall(LINK_RE, download_list_html)
1880 if(len(links) == 0):
1881 raise ExtractorError(u'ERROR: no known formats available for video')
1883 self.to_screen(u'Links found: %d' % len(links))
1888 # A link looks like this:
1889 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
1890 # A path looks like this:
1891 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
# Format id like "480p-370k" is recovered from the 5th path segment.
1892 video_url = unescapeHTML( link )
1893 path = compat_urllib_parse_urlparse( video_url ).path
1894 extension = os.path.splitext( path )[1][1:]
1895 format = path.split('/')[4].split('_')[:2]
1898 format = "-".join( format )
1899 # title = u'%s-%s-%s' % (video_title, size, bitrate)
1904 'uploader': video_uploader,
1905 'upload_date': upload_date,
1906 'title': video_title,
1909 'thumbnail': thumbnail,
1910 'description': video_description
1913 if self._downloader.params.get('listformats', None):
1914 self._print_formats(formats)
1917 req_format = self._downloader.params.get('format', None)
1918 self.to_screen(u'Format: %s' % req_format)
# Formats are assumed ordered best-first: [0] is best, [-1] is worst.
1920 if req_format is None or req_format == 'best':
1922 elif req_format == 'worst':
1923 return [formats[-1]]
1924 elif req_format in ('-1', 'all'):
1927 format = self._specific( req_format, formats )
1929 raise ExtractorError(u'Requested format not available')
# Extractor for pornotube.com: flv URL and upload date are scraped from the
# page; title and ids come from the URL itself.
# NOTE(review): excerpt appears subsampled (embedded line numbers skip); the
# "if mobj is None:" guard and the final return are not visible here.
1934 class PornotubeIE(InfoExtractor):
1935 """Information extractor for pornotube.com."""
1936 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
1938 def _real_extract(self, url):
1939 mobj = re.match(self._VALID_URL, url)
1941 raise ExtractorError(u'Invalid URL: %s' % url)
1943 video_id = mobj.group('videoid')
1944 video_title = mobj.group('title')
1946 # Get webpage content
1947 webpage = self._download_webpage(url, video_id)
1950 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
1951 video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
1952 video_url = compat_urllib_parse.unquote(video_url)
1954 #Get the uploaded date
1955 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
1956 upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
# Date is optional (fatal=False); normalize only if it was found.
1957 if upload_date: upload_date = unified_strdate(upload_date)
1959 info = {'id': video_id,
1962 'upload_date': upload_date,
1963 'title': video_title,
# Extractor for youjizz.com: resolves the embed page referenced by the video
# page, then pulls the media URL from the embedded flash-player setup.
# NOTE(review): excerpt appears subsampled (embedded line numbers skip); the
# "if result is None:" style guard and the final return are not visible here.
1969 class YouJizzIE(InfoExtractor):
1970 """Information extractor for youjizz.com."""
1971 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
1973 def _real_extract(self, url):
1974 mobj = re.match(self._VALID_URL, url)
1976 raise ExtractorError(u'Invalid URL: %s' % url)
1978 video_id = mobj.group('videoid')
1980 # Get webpage content
1981 webpage = self._download_webpage(url, video_id)
1983 # Get the video title
1984 video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
1985 webpage, u'title').strip()
1987 # Get the embed page
1988 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
1990 raise ExtractorError(u'ERROR: unable to extract embed page')
# video_id is rebound to the numeric embed id from here on.
1992 embed_page_url = result.group(0).strip()
1993 video_id = result.group('videoid')
1995 webpage = self._download_webpage(embed_page_url, video_id)
1998 video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
1999 webpage, u'video URL')
2001 info = {'id': video_id,
2003 'title': video_title,
2006 'player_url': embed_page_url}
# Extractor for 8tracks.com mixes: reads the PAGE.mix JSON from the page, then
# walks the play/next API one track at a time until at_last_track is set.
# NOTE(review): excerpt appears subsampled (embedded line numbers skip);
# mix_id assignment, the res list initialisation, the break and the final
# return are not visible here; verify against the complete file.
2010 class EightTracksIE(InfoExtractor):
2012 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
2014 def _real_extract(self, url):
2015 mobj = re.match(self._VALID_URL, url)
2017 raise ExtractorError(u'Invalid URL: %s' % url)
2018 playlist_id = mobj.group('id')
2020 webpage = self._download_webpage(url, playlist_id)
2022 json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
2023 data = json.loads(json_like)
# A random session token is required by the play API.
2025 session = str(random.randint(0, 1000000000))
2027 track_count = data['tracks_count']
2028 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
2029 next_url = first_url
2031 for i in itertools.count():
2032 api_json = self._download_webpage(next_url, playlist_id,
2033 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
2034 errnote=u'Failed to download song information')
2035 api_data = json.loads(api_json)
2036 track_data = api_data[u'set']['track']
2038 'id': track_data['id'],
2039 'url': track_data['track_file_stream_url'],
2040 'title': track_data['performer'] + u' - ' + track_data['name'],
2041 'raw_title': track_data['name'],
2042 'uploader_id': data['user']['login'],
# Stop once the API flags the last track; otherwise chain to /next.
2046 if api_data['set']['at_last_track']:
2048 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# Extractor for keek.com: media and thumbnail URLs are derived directly from
# the video id on the CDN; title/uploader are scraped from the page.
# NOTE(review): excerpt appears subsampled (embedded line numbers skip); the
# final return is not visible here.
2051 class KeekIE(InfoExtractor):
2052 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
2055 def _real_extract(self, url):
2056 m = re.match(self._VALID_URL, url)
2057 video_id = m.group('videoID')
2059 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
2060 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
2061 webpage = self._download_webpage(url, video_id)
2063 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
2066 uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
2067 webpage, u'uploader', fatal=False)
2073 'title': video_title,
2074 'thumbnail': thumbnail,
2075 'uploader': uploader
# Extractor for ted.com: either a single talk (returns one info dict) or a
# playlist page (returns a playlist of TED url_results).
# NOTE(review): excerpt appears subsampled (embedded line numbers skip);
# the "else:" for the playlist branch and the _talk_info return are not
# visible here; verify against the complete file.
2079 class TEDIE(InfoExtractor):
2080 _VALID_URL=r'''http://www\.ted\.com/
2082 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
2084 ((?P<type_talk>talks)) # We have a simple talk
2086 (/lang/(.*?))? # The url may contain the language
2087 /(?P<name>\w+) # Here goes the name and then ".html"
# Overridden because _VALID_URL uses verbose-mode whitespace/comments.
2091 def suitable(cls, url):
2092 """Receives a URL and returns True if suitable for this IE."""
2093 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2095 def _real_extract(self, url):
2096 m=re.match(self._VALID_URL, url, re.VERBOSE)
2097 if m.group('type_talk'):
2098 return [self._talk_info(url)]
2100 playlist_id=m.group('playlist_id')
2101 name=m.group('name')
2102 self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
2103 return [self._playlist_videos_info(url,name,playlist_id)]
2105 def _playlist_videos_info(self,url,name,playlist_id=0):
2106 '''Returns the videos of the playlist'''
2108 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
2109 ([.\s]*?)data-playlist_item_id="(\d+)"
2110 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
2112 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
2113 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
2114 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
2115 m_names=re.finditer(video_name_RE,webpage)
2117 playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
2118 webpage, 'playlist title')
# Each playlist entry is deferred to the TED extractor via url_result.
2120 playlist_entries = []
2121 for m_video, m_name in zip(m_videos,m_names):
2122 video_id=m_video.group('video_id')
2123 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
2124 playlist_entries.append(self.url_result(talk_url, 'TED'))
2125 return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
2127 def _talk_info(self, url, video_id=0):
2128 """Return the video for the talk in the url"""
2129 m = re.match(self._VALID_URL, url,re.VERBOSE)
2130 video_name = m.group('name')
2131 webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
2132 self.report_extraction(video_name)
2133 # If the url includes the language we get the title translated
2134 title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
2136 json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
2137 webpage, 'json data')
2138 info = json.loads(json_data)
2139 desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
2140 webpage, 'description', flags = re.DOTALL)
2142 thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
2143 webpage, 'thumbnail')
# The last htmlStreams entry is used as the stream URL.
2146 'url': info['htmlStreams'][-1]['file'],
2149 'thumbnail': thumbnail,
2150 'description': desc,
2154 class MySpassIE(InfoExtractor):
# Extractor for myspass.de: derives the video id from the URL path, fetches
# an XML metadata document for it, and reads url/title/format/description/
# thumbnail fields out of that XML.
2155 _VALID_URL = r'http://www.myspass.de/.*'
2157 def _real_extract(self, url):
2158 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
2160 # video id is the last path element of the URL
2161 # usually there is a trailing slash, so also try the second but last
2162 url_path = compat_urllib_parse_urlparse(url).path
2163 url_parent_path, video_id = os.path.split(url_path)
# NOTE(review): the condition guarding this fallback (original line 2164,
# presumably an emptiness check on video_id) is missing from this chunk.
2165 _, video_id = os.path.split(url_parent_path)
# Fetch and parse the metadata XML for the resolved video id.
2168 metadata_url = META_DATA_URL_TEMPLATE % video_id
2169 metadata_text = self._download_webpage(metadata_url, video_id)
2170 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
2172 # extract values from metadata
2173 url_flv_el = metadata.find('url_flv')
2174 if url_flv_el is None:
2175 raise ExtractorError(u'Unable to extract download url')
2176 video_url = url_flv_el.text
# File extension is taken from the download URL's suffix (dot stripped).
2177 extension = os.path.splitext(video_url)[1][1:]
2178 title_el = metadata.find('title')
2179 if title_el is None:
2180 raise ExtractorError(u'Unable to extract title')
2181 title = title_el.text
2182 format_id_el = metadata.find('format_id')
2183 if format_id_el is None:
# NOTE(review): the body of the `is None` branch (original lines 2184-2185)
# is missing from this chunk; the line below is the else path.
2186 format = format_id_el.text
2187 description_el = metadata.find('description')
2188 if description_el is not None:
2189 description = description_el.text
# NOTE(review): else-branches supplying defaults (original lines 2190-2191)
# are missing here.
2192 imagePreview_el = metadata.find('imagePreview')
2193 if imagePreview_el is not None:
2194 thumbnail = imagePreview_el.text
# NOTE(review): the info-dict opening and its first entries (original lines
# 2195-2202) plus the return statement are missing from this chunk.
2203 'thumbnail': thumbnail,
2204 'description': description
2208 class SpiegelIE(InfoExtractor):
# Extractor for spiegel.de videos: reads the numeric video id from the URL,
# downloads a per-video XML manifest and takes the last format entry from it.
2209 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
2211 def _real_extract(self, url):
2212 m = re.match(self._VALID_URL, url)
2213 video_id = m.group('videoID')
2215 webpage = self._download_webpage(url, video_id)
2217 video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
# NOTE(review): the trailing argument line of the call above (original line
# 2218, presumably `webpage, u'title')`) is missing from this chunk.
# The flash XML manifest lists the available files for this video id.
2220 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
2221 xml_code = self._download_webpage(xml_url, video_id,
2222 note=u'Downloading XML', errnote=u'Failed to download XML')
2224 idoc = xml.etree.ElementTree.fromstring(xml_code)
# The last child of the document is used as the chosen format/type entry.
2225 last_type = idoc[-1]
2226 filename = last_type.findall('./filename')[0].text
2227 duration = float(last_type.findall('./duration')[0].text)
2229 video_url = 'http://video2.spiegel.de/flash/' + filename
# Extension is whatever follows the final dot of the manifest filename.
2230 video_ext = filename.rpartition('.')[2]
# NOTE(review): the info-dict opening/first entries (original lines
# 2231-2234) and the return (2237-2239) are missing from this chunk.
2235 'title': video_title,
2236 'duration': duration,
2240 class LiveLeakIE(InfoExtractor):
# Extractor for liveleak.com: scrapes the player's `file: "..."` URL and
# OpenGraph title/description metadata from the view page.
2242 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
2243 IE_NAME = u'liveleak'
2245 def _real_extract(self, url):
2246 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the `if mobj is None:` guard (original line 2247) is
# missing from this chunk — in the full file the raise below is conditional.
2248 raise ExtractorError(u'Invalid URL: %s' % url)
2250 video_id = mobj.group('video_id')
2252 webpage = self._download_webpage(url, video_id)
2254 video_url = self._search_regex(r'file: "(.*?)",',
2255 webpage, u'video URL')
# Strip the site prefix LiveLeak prepends to og:title values.
2257 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
2258 webpage, u'title').replace('LiveLeak.com -', '').strip()
2260 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
2261 webpage, u'description', fatal=False)
2263 video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
2264 webpage, u'uploader', fatal=False)
# NOTE(review): the info-dict opening and id/url/ext entries (original
# lines 2265-2269) and the return are missing from this chunk.
2270 'title': video_title,
2271 'description': video_description,
2272 'uploader': video_uploader
2279 class TumblrIE(InfoExtractor):
# Extractor for Tumblr video posts: rebuilds the canonical post URL from the
# blog name and post id, then pulls the escaped video_file URL out of the page.
2280 _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
2282 def _real_extract(self, url):
2283 m_url = re.match(self._VALID_URL, url)
2284 video_id = m_url.group('id')
2285 blog = m_url.group('blog_name')
2287 url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
2288 webpage = self._download_webpage(url, video_id)
# The page embeds the player markup JS-escaped, hence the \x22 (= '"')
# sequences in this pattern.
2290 re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
2291 video = re.search(re_video, webpage)
# NOTE(review): the `if video is None:` guard (original line 2292) is
# missing from this chunk — the raise below is conditional in the full file.
2293 raise ExtractorError(u'Unable to extract video')
2294 video_url = video.group('video_url')
2295 ext = video.group('ext')
2297 video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
2298 webpage, u'thumbnail', fatal=False) # We pick the first poster
# Thumbnail URL comes out JS-escaped too; drop the backslashes.
2299 if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
2301 # The only place where you can get a title, it's not complete,
2302 # but searching in other places doesn't work for all videos
2303 video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
2304 webpage, u'title', flags=re.DOTALL)
2306 return [{'id': video_id,
# NOTE(review): the 'url'/'ext' entries of this dict (original line 2307 and
# following) are partly missing from this chunk.
2308 'title': video_title,
2309 'thumbnail': video_thumbnail,
2313 class BandcampIE(InfoExtractor):
# Extractor for free Bandcamp tracks: follows the freeDownloadPage link,
# reads the track's JSON blob there, and rebuilds the statdownload URL that
# yields the final (non-expired) mp3-320 link.
2314 _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
2316 def _real_extract(self, url):
2317 mobj = re.match(self._VALID_URL, url)
2318 title = mobj.group('title')
2319 webpage = self._download_webpage(url, title)
2320 # We get the link to the free download page
2321 m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
2322 if m_download is None:
2323 raise ExtractorError(u'No free songs found')
2325 download_link = m_download.group(1)
# Track id is embedded in the inline TralbumData javascript object.
2326 id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
2327 webpage, re.MULTILINE|re.DOTALL).group('id')
2329 download_webpage = self._download_webpage(download_link, id,
2330 'Downloading free downloads page')
2331 # We get the dictionary of the track from some javascrip code
2332 info = re.search(r'items: (.*?),$',
2333 download_webpage, re.MULTILINE).group(1)
2334 info = json.loads(info)[0]
2335 # We pick mp3-320 for now, until format selection can be easily implemented.
2336 mp3_info = info[u'downloads'][u'mp3-320']
2337 # If we try to use this url it says the link has expired
2338 initial_url = mp3_info[u'url']
# Pick apart the expired URL to recover server/fsig/ts for the rebuild.
2339 re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
2340 m_url = re.match(re_url, initial_url)
2341 #We build the url we will use to get the final track url
2342 # This url is build in Bandcamp in the script download_bunde_*.js
2343 request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
2344 final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
2345 # If we could correctly generate the .rand field the url would be
2346 #in the "download_url" key
2347 final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
2349 track_info = {'id':id,
2350 'title' : info[u'title'],
# NOTE(review): the 'ext'/'url' entries (original lines 2351-2352) and the
# return statement after this dict are missing from this chunk.
2353 'thumbnail' : info[u'thumb_url'],
2354 'uploader' : info[u'artist']
2359 class RedTubeIE(InfoExtractor):
2360 """Information Extractor for redtube"""
2361 _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
2363 def _real_extract(self,url):
2364 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the `if mobj is None:` guard (original line 2365) is
# missing from this chunk — the raise below is conditional in the full file.
2366 raise ExtractorError(u'Invalid URL: %s' % url)
2368 video_id = mobj.group('id')
2369 video_extension = 'mp4'
2370 webpage = self._download_webpage(url, video_id)
2372 self.report_extraction(video_id)
# The mp4 source URL sits in a <source> tag on the watch page.
2374 video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
2375 webpage, u'video URL')
2377 video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
# NOTE(review): the call's trailing argument line and the info-dict opening
# (original lines 2378-2382) are missing from this chunk.
2383 'ext': video_extension,
2384 'title': video_title,
2387 class InaIE(InfoExtractor):
2388 """Information Extractor for Ina.fr"""
2389 _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
2391 def _real_extract(self,url):
2392 mobj = re.match(self._VALID_URL, url)
# NOTE(review): a guard line (original line 2393) is missing from this
# chunk between the match and the group access.
2394 video_id = mobj.group('id')
# Metadata comes from the player's MRSS feed, not the HTML page itself.
2395 mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
2396 video_extension = 'mp4'
2397 webpage = self._download_webpage(mrss_url, video_id)
2399 self.report_extraction(video_id)
2401 video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
2402 webpage, u'video URL')
# Title is wrapped in CDATA inside the feed's <title> element.
2404 video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
# NOTE(review): the trailing argument line and the info-dict opening
# (original lines 2405-2409) plus the return are missing from this chunk.
2410 'ext': video_extension,
2411 'title': video_title,
2414 class HowcastIE(InfoExtractor):
2415 """Information Extractor for Howcast.com"""
2416 _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
2418 def _real_extract(self, url):
2419 mobj = re.match(self._VALID_URL, url)
# NOTE(review): a guard line (original line 2420) is missing from this
# chunk between the match and the group access.
2421 video_id = mobj.group('id')
# Re-canonicalize the URL from the numeric id before downloading.
2422 webpage_url = 'http://www.howcast.com/videos/' + video_id
2423 webpage = self._download_webpage(webpage_url, video_id)
2425 self.report_extraction(video_id)
# The mobile mp4 URL appears in a JS `file:` assignment on the page.
2427 video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
2428 webpage, u'video URL')
2430 video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
# NOTE(review): the trailing argument line of the call above (original
# lines 2431-2432) is missing from this chunk.
2433 video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
2434 webpage, u'description', fatal=False)
2436 thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
2437 webpage, u'thumbnail', fatal=False)
# NOTE(review): the info-dict opening with id/url/ext entries (original
# lines 2438-2442) and the return are missing from this chunk.
2443 'title': video_title,
2444 'description': video_description,
2445 'thumbnail': thumbnail,
2448 class VineIE(InfoExtractor):
2449 """Information Extractor for Vine.co"""
2450 _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
2452 def _real_extract(self, url):
2453 mobj = re.match(self._VALID_URL, url)
# NOTE(review): a guard line (original line 2454) is missing from this
# chunk between the match and the group access.
2455 video_id = mobj.group('id')
2456 webpage_url = 'https://vine.co/v/' + video_id
2457 webpage = self._download_webpage(webpage_url, video_id)
2459 self.report_extraction(video_id)
# The raw stream URL is exposed via the twitter:player:stream meta tag.
2461 video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
2462 webpage, u'video URL')
2464 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
# NOTE(review): the trailing argument line of the call above (original
# lines 2465-2466) is missing from this chunk.
2467 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
2468 webpage, u'thumbnail', fatal=False)
2470 uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
2471 webpage, u'uploader', fatal=False, flags=re.DOTALL)
# NOTE(review): the info-dict opening with id/url/ext entries (original
# lines 2472-2476) and the return are missing from this chunk.
2477 'title': video_title,
2478 'thumbnail': thumbnail,
2479 'uploader': uploader,
2482 class FlickrIE(InfoExtractor):
2483 """Information Extractor for Flickr videos"""
# Resolves a Flickr video in three hops: the photo page yields a secret,
# a first XML document yields a node id, and a playlist XML yields the
# final STREAM app/fullpath pair that forms the video URL.
2484 _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
2486 def _real_extract(self, url):
2487 mobj = re.match(self._VALID_URL, url)
# NOTE(review): a guard line (original line 2488) is missing from this
# chunk between the match and the group accesses.
2489 video_id = mobj.group('id')
2490 video_uploader_id = mobj.group('uploader_id')
2491 webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
2492 webpage = self._download_webpage(webpage_url, video_id)
2494 secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
2496 first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
2497 first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
2499 node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
2500 first_xml, u'node_id')
2502 second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
2503 second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
2505 self.report_extraction(video_id)
2507 mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
# NOTE(review): the `if mobj is None:` guard (original line 2508) is
# missing from this chunk — the raise below is conditional in the full file.
2509 raise ExtractorError(u'Unable to extract video url')
# Final URL = APP prefix + HTML-unescaped FULLPATH.
2510 video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
2512 video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
2513 webpage, u'video title')
2515 video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
2516 webpage, u'description', fatal=False)
2518 thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
2519 webpage, u'thumbnail', fatal=False)
# NOTE(review): the info-dict opening with id/url/ext entries (original
# lines 2520-2524) and the return are missing from this chunk.
2525 'title': video_title,
2526 'description': video_description,
2527 'thumbnail': thumbnail,
2528 'uploader_id': video_uploader_id,
2531 class TeamcocoIE(InfoExtractor):
# Extractor for teamcoco.com: finds the numeric data-id on the article page,
# then reads the high-quality file URL from the site's CVP XML endpoint.
2532 _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
2534 def _real_extract(self, url):
2535 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the `if mobj is None:` guard (original line 2536) is
# missing from this chunk — the raise below is conditional in the full file.
2537 raise ExtractorError(u'Invalid URL: %s' % url)
2538 url_title = mobj.group('url_title')
2539 webpage = self._download_webpage(url, url_title)
2541 video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
2542 webpage, u'video id')
2544 self.report_extraction(video_id)
2546 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
# NOTE(review): the trailing argument line of the call above (original
# lines 2547-2548) is missing from this chunk.
2549 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
2550 webpage, u'thumbnail', fatal=False)
2552 video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
2553 webpage, u'description', fatal=False)
2555 data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
2556 data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
# Picks the <file type="high"> entry from the CVP XML.
2558 video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
# NOTE(review): the trailing argument line and the info-dict opening with
# id/url/ext entries (original lines 2559-2564) are missing from this chunk.
2565 'title': video_title,
2566 'thumbnail': thumbnail,
2567 'description': video_description,
2570 class XHamsterIE(InfoExtractor):
2571 """Information Extractor for xHamster"""
2572 _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
2574 def _real_extract(self,url):
2575 mobj = re.match(self._VALID_URL, url)
# NOTE(review): a guard line (original line 2576) is missing from this
# chunk between the match and the group access.
2577 video_id = mobj.group('id')
2578 mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
2579 webpage = self._download_webpage(mrss_url, video_id)
# The player config carries a 'srv' host and a 'file' path/URL.
2581 mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
# NOTE(review): the `if mobj is None:` guard (original line 2582) is
# missing from this chunk — the raise below is conditional in the full file.
2583 raise ExtractorError(u'Unable to extract media URL')
# Empty server means 'file' is already a full (percent-encoded) URL;
# otherwise the URL is server + '/key=' + file.
2584 if len(mobj.group('server')) == 0:
2585 video_url = compat_urllib_parse.unquote(mobj.group('file'))
# NOTE(review): the `else:` line (original line 2586) is missing here.
2587 video_url = mobj.group('server')+'/key='+mobj.group('file')
2588 video_extension = video_url.split('.')[-1]
2590 video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
# NOTE(review): the trailing argument line of the call above (original
# lines 2591-2592) is missing from this chunk.
2593 # Can't see the description anywhere in the UI
2594 # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
2595 # webpage, u'description', fatal=False)
2596 # if video_description: video_description = unescapeHTML(video_description)
# Upload date is scraped from a tooltip hint and flattened to YYYYMMDD.
2598 mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
# NOTE(review): the `if mobj:`/`else:` structure around the three lines
# below (original lines 2599/2601) is missing from this chunk.
2600 video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
2602 video_upload_date = None
2603 self._downloader.report_warning(u'Unable to extract upload date')
2605 video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
2606 webpage, u'uploader id', default=u'anonymous')
2608 video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
2609 webpage, u'thumbnail', fatal=False)
# NOTE(review): the info-dict opening with id/url entries (original lines
# 2610-2613) and the return are missing from this chunk.
2614 'ext': video_extension,
2615 'title': video_title,
2616 # 'description': video_description,
2617 'upload_date': video_upload_date,
2618 'uploader_id': video_uploader_id,
2619 'thumbnail': video_thumbnail
2622 class HypemIE(InfoExtractor):
2623 """Information Extractor for hypem"""
2624 _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
2626 def _real_extract(self, url):
2627 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the `if mobj is None:` guard (original line 2628) is
# missing from this chunk — the raise below is conditional in the full file.
2629 raise ExtractorError(u'Invalid URL: %s' % url)
2630 track_id = mobj.group(1)
# Append ax/ts query params; the response's Set-Cookie is needed for the
# later /serve/source request.
2632 data = { 'ax': 1, 'ts': time.time() }
2633 data_encoded = compat_urllib_parse.urlencode(data)
2634 complete_url = url + "?" + data_encoded
2635 request = compat_urllib_request.Request(complete_url)
2636 response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
2637 cookie = urlh.headers.get('Set-Cookie', '')
2639 self.report_extraction(track_id)
# Track metadata is embedded as JSON in the displayList-data script tag.
2641 html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
2642 response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
# NOTE(review): the try/except structure around the parse below (original
# lines 2643/2646) is missing from this chunk.
2644 track_list = json.loads(html_tracks)
2645 track = track_list[u'tracks'][0]
2647 raise ExtractorError(u'Hypemachine contained invalid JSON.')
# NOTE(review): the line extracting `key` from the track (original line
# 2649-ish) is missing from this chunk; it is used in serve_url below.
2650 track_id = track[u"id"]
2651 artist = track[u"artist"]
2652 title = track[u"song"]
2654 serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
2655 request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
2656 request.add_header('cookie', cookie)
2657 song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
# NOTE(review): the try/except structure around this parse (original lines
# 2658/2660) is missing from this chunk.
2659 song_data = json.loads(song_data_json)
2661 raise ExtractorError(u'Hypemachine contained invalid JSON.')
2662 final_url = song_data[u"url"]
# NOTE(review): the returned info dict (original lines 2663-2671) is
# missing from this chunk.
2672 class Vbox7IE(InfoExtractor):
2673 """Information Extractor for Vbox7"""
2674 _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
2676 def _real_extract(self,url):
2677 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the `if mobj is None:` guard (original line 2678) is
# missing from this chunk — the raise below is conditional in the full file.
2679 raise ExtractorError(u'Invalid URL: %s' % url)
2680 video_id = mobj.group(1)
# Follow the JS-based redirect the play page performs via window.location.
2682 redirect_page, urlh = self._download_webpage_handle(url, video_id)
2683 new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
2684 redirect_url = urlh.geturl() + new_location
2685 webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
# Title is the first '/'-separated component of the <title> tag.
2687 title = self._html_search_regex(r'<title>(.*)</title>',
2688 webpage, u'title').split('/')[0].strip()
# POST to magare.do returns '&'-joined key=value pairs with the media and
# thumbnail URLs.
2691 info_url = "http://vbox7.com/play/magare.do"
2692 data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
2693 info_request = compat_urllib_request.Request(info_url, data)
2694 info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
2695 info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
2696 if info_response is None:
2697 raise ExtractorError(u'Unable to extract the media url')
2698 (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
# NOTE(review): the returned info dict's opening and url/ext/title entries
# (original lines 2699-2704) and the return are missing from this chunk.
2705 'thumbnail': thumbnail_url,
2709 def gen_extractors():
2710 """ Return a list of an instance of every supported extractor.
2711 The order does matter; the first extractor matched is the one handling the URL.
# NOTE(review): the returned list literal is almost entirely missing from
# this chunk (original lines 2712-2778 are elided except the three entries
# below); only these three instantiations are visible.
2714 YoutubePlaylistIE(),
2739 StanfordOpenClassroomIE(),
2749 WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Return the info extractor class registered under *ie_name*.

    The class is looked up in this module's globals under the name
    ``<ie_name>IE`` (e.g. ``'Youtube'`` -> ``YoutubeIE``).  A ``KeyError``
    propagates if no such extractor class exists.
    """
    # globals() is this module's namespace, where all *IE classes are defined.
    return globals()[ie_name + 'IE']