youtube_dl/InfoExtractors.py

   1 import base64
   2 import datetime
   3 import itertools
   4 import netrc
   5 import os
   6 import re
   7 import socket
   8 import time
   9 import email.utils
  10 import xml.etree.ElementTree
  11 import random
  12 import math
  13 import operator
  14 import hashlib
  15 import binascii
  16 import urllib
  17
  18 from .utils import *
  19 from .extractor.common import InfoExtractor, SearchInfoExtractor
  20
  21 from .extractor.ard import ARDIE
  22 from .extractor.arte import ArteTvIE
  23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
  24 from .extractor.comedycentral import ComedyCentralIE
  25 from .extractor.collegehumor import CollegeHumorIE
  26 from .extractor.dailymotion import DailymotionIE
  27 from .extractor.depositfiles import DepositFilesIE
  28 from .extractor.escapist import EscapistIE
  29 from .extractor.facebook import FacebookIE
  30 from .extractor.gametrailers import GametrailersIE
  31 from .extractor.generic import GenericIE
  32 from .extractor.googleplus import GooglePlusIE
  33 from .extractor.googlesearch import GoogleSearchIE
  34 from .extractor.infoq import InfoQIE
  35 from .extractor.metacafe import MetacafeIE
  36 from .extractor.mixcloud import MixcloudIE
  37 from .extractor.mtv import MTVIE
  38 from .extractor.myvideo import MyVideoIE
  39 from .extractor.nba import NBAIE
  40 from .extractor.statigram import StatigramIE
  41 from .extractor.photobucket import PhotobucketIE
  42 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
  43 from .extractor.stanfordoc import StanfordOpenClassroomIE
  44 from .extractor.ted import TEDIE
  45 from .extractor.vimeo import VimeoIE
  46 from .extractor.xvideos import XVideosIE
  47 from .extractor.yahoo import YahooIE, YahooSearchIE
  48 from .extractor.youku import YoukuIE
  49 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
  50 from .extractor.zdf import ZDFIE
  51
  52
  53
  54
  55
  56
  57
  58
  59
  60 class XNXXIE(InfoExtractor):
  61     """Information extractor for xnxx.com"""
  62
  63     _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
  64     IE_NAME = u'xnxx'
  65     VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
  66     VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
  67     VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
  68
  69     def _real_extract(self, url):
  70         mobj = re.match(self._VALID_URL, url)
  71         if mobj is None:
  72             raise ExtractorError(u'Invalid URL: %s' % url)
  73         video_id = mobj.group(1)
  74
  75         # Get webpage content
  76         webpage = self._download_webpage(url, video_id)
  77
  78         video_url = self._search_regex(self.VIDEO_URL_RE,
  79             webpage, u'video URL')
  80         video_url = compat_urllib_parse.unquote(video_url)
  81
  82         video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
  83             webpage, u'title')
  84
  85         video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
  86             webpage, u'thumbnail', fatal=False)
  87
  88         return [{
  89             'id': video_id,
  90             'url': video_url,
  91             'uploader': None,
  92             'upload_date': None,
  93             'title': video_title,
  94             'ext': 'flv',
  95             'thumbnail': video_thumbnail,
  96             'description': None,
  97         }]
  98
  99
 100
 101
 102 class JustinTVIE(InfoExtractor):
 103     """Information extractor for justin.tv and twitch.tv"""
 104     # TODO: One broadcast may be split into multiple videos. The key
 105     # 'broadcast_id' is the same for all parts, and 'broadcast_part'
 106     # starts at 1 and increases. Can we treat all parts as one video?
 107
 108     _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
 109         (?:
 110             (?P<channelid>[^/]+)|
 111             (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
 112             (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
 113         )
 114         /?(?:\#.*)?$
 115         """
 116     _JUSTIN_PAGE_LIMIT = 100
 117     IE_NAME = u'justin.tv'
 118
 119     def report_download_page(self, channel, offset):
 120         """Report attempt to download a single page of videos."""
 121         self.to_screen(u'%s: Downloading video information from %d to %d' %
 122                 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
 123
 124     # Return count of items, list of *valid* items
 125     def _parse_page(self, url, video_id):
 126         webpage = self._download_webpage(url, video_id,
 127                                          u'Downloading video info JSON',
 128                                          u'unable to download video info JSON')
 129
 130         response = json.loads(webpage)
 131         if type(response) != list:
 132             error_text = response.get('error', 'unknown error')
 133             raise ExtractorError(u'Justin.tv API: %s' % error_text)
 134         info = []
 135         for clip in response:
 136             video_url = clip['video_file_url']
 137             if video_url:
 138                 video_extension = os.path.splitext(video_url)[1][1:]
 139                 video_date = re.sub('-', '', clip['start_time'][:10])
 140                 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
 141                 video_id = clip['id']
 142                 video_title = clip.get('title', video_id)
 143                 info.append({
 144                     'id': video_id,
 145                     'url': video_url,
 146                     'title': video_title,
 147                     'uploader': clip.get('channel_name', video_uploader_id),
 148                     'uploader_id': video_uploader_id,
 149                     'upload_date': video_date,
 150                     'ext': video_extension,
 151                 })
 152         return (len(response), info)
 153
 154     def _real_extract(self, url):
 155         mobj = re.match(self._VALID_URL, url)
 156         if mobj is None:
 157             raise ExtractorError(u'invalid URL: %s' % url)
 158
 159         api_base = 'http://api.justin.tv'
 160         paged = False
 161         if mobj.group('channelid'):
 162             paged = True
 163             video_id = mobj.group('channelid')
 164             api = api_base + '/channel/archives/%s.json' % video_id
 165         elif mobj.group('chapterid'):
 166             chapter_id = mobj.group('chapterid')
 167
 168             webpage = self._download_webpage(url, chapter_id)
 169             m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
 170             if not m:
 171                 raise ExtractorError(u'Cannot find archive of a chapter')
 172             archive_id = m.group(1)
 173
 174             api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
 175             chapter_info_xml = self._download_webpage(api, chapter_id,
 176                                              note=u'Downloading chapter information',
 177                                              errnote=u'Chapter information download failed')
 178             doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
 179             for a in doc.findall('.//archive'):
 180                 if archive_id == a.find('./id').text:
 181                     break
 182             else:
 183                 raise ExtractorError(u'Could not find chapter in chapter information')
 184
 185             video_url = a.find('./video_file_url').text
 186             video_ext = video_url.rpartition('.')[2] or u'flv'
 187
 188             chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
 189             chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
 190                                    note='Downloading chapter metadata',
 191                                    errnote='Download of chapter metadata failed')
 192             chapter_info = json.loads(chapter_info_json)
 193
 194             bracket_start = int(doc.find('.//bracket_start').text)
 195             bracket_end = int(doc.find('.//bracket_end').text)
 196
 197             # TODO determine start (and probably fix up file)
 198             #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
 199             #video_url += u'?start=' + TODO:start_timestamp
 200             # bracket_start is 13290, but we want 51670615
 201             self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
 202                                             u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
 203
 204             info = {
 205                 'id': u'c' + chapter_id,
 206                 'url': video_url,
 207                 'ext': video_ext,
 208                 'title': chapter_info['title'],
 209                 'thumbnail': chapter_info['preview'],
 210                 'description': chapter_info['description'],
 211                 'uploader': chapter_info['channel']['display_name'],
 212                 'uploader_id': chapter_info['channel']['name'],
 213             }
 214             return [info]
 215         else:
 216             video_id = mobj.group('videoid')
 217             api = api_base + '/broadcast/by_archive/%s.json' % video_id
 218
 219         self.report_extraction(video_id)
 220
 221         info = []
 222         offset = 0
 223         limit = self._JUSTIN_PAGE_LIMIT
 224         while True:
 225             if paged:
 226                 self.report_download_page(video_id, offset)
 227             page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
 228             page_count, page_info = self._parse_page(page_url, video_id)
 229             info.extend(page_info)
 230             if not paged or page_count != limit:
 231                 break
 232             offset += limit
 233         return info
 234
 235 class FunnyOrDieIE(InfoExtractor):
 236     _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
 237
 238     def _real_extract(self, url):
 239         mobj = re.match(self._VALID_URL, url)
 240         if mobj is None:
 241             raise ExtractorError(u'invalid URL: %s' % url)
 242
 243         video_id = mobj.group('id')
 244         webpage = self._download_webpage(url, video_id)
 245
 246         video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
 247             webpage, u'video URL', flags=re.DOTALL)
 248
 249         title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
 250             r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
 251
 252         video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
 253             webpage, u'description', fatal=False, flags=re.DOTALL)
 254
 255         info = {
 256             'id': video_id,
 257             'url': video_url,
 258             'ext': 'mp4',
 259             'title': title,
 260             'description': video_description,
 261         }
 262         return [info]
 263
 264 class SteamIE(InfoExtractor):
 265     _VALID_URL = r"""http://store\.steampowered\.com/
 266                 (agecheck/)?
 267                 (?P<urltype>video|app)/ #If the page is only for videos or for a game
 268                 (?P<gameID>\d+)/?
 269                 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
 270                 """
 271     _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
 272     _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'
 273
 274     @classmethod
 275     def suitable(cls, url):
 276         """Receives a URL and returns True if suitable for this IE."""
 277         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
 278
 279     def _real_extract(self, url):
 280         m = re.match(self._VALID_URL, url, re.VERBOSE)
 281         gameID = m.group('gameID')
 282
 283         videourl = self._VIDEO_PAGE_TEMPLATE % gameID
 284         webpage = self._download_webpage(videourl, gameID)
 285
 286         if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
 287             videourl = self._AGECHECK_TEMPLATE % gameID
 288             self.report_age_confirmation()
 289             webpage = self._download_webpage(videourl, gameID)
 290
 291         self.report_extraction(gameID)
 292         game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
 293                                              webpage, 'game title')
 294
 295         urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
 296         mweb = re.finditer(urlRE, webpage)
 297         namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
 298         titles = re.finditer(namesRE, webpage)
 299         thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
 300         thumbs = re.finditer(thumbsRE, webpage)
 301         videos = []
 302         for vid,vtitle,thumb in zip(mweb,titles,thumbs):
 303             video_id = vid.group('videoID')
 304             title = vtitle.group('videoName')
 305             video_url = vid.group('videoURL')
 306             video_thumb = thumb.group('thumbnail')
 307             if not video_url:
 308                 raise ExtractorError(u'Cannot find video url for %s' % video_id)
 309             info = {
 310                 'id':video_id,
 311                 'url':video_url,
 312                 'ext': 'flv',
 313                 'title': unescapeHTML(title),
 314                 'thumbnail': video_thumb
 315                   }
 316             videos.append(info)
 317         return [self.playlist_result(videos, gameID, game_title)]
 318
 319 class UstreamIE(InfoExtractor):
 320     _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
 321     IE_NAME = u'ustream'
 322
 323     def _real_extract(self, url):
 324         m = re.match(self._VALID_URL, url)
 325         video_id = m.group('videoID')
 326
 327         video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
 328         webpage = self._download_webpage(url, video_id)
 329
 330         self.report_extraction(video_id)
 331
 332         video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
 333             webpage, u'title')
 334
 335         uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
 336             webpage, u'uploader', fatal=False, flags=re.DOTALL)
 337
 338         thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
 339             webpage, u'thumbnail', fatal=False)
 340
 341         info = {
 342                 'id': video_id,
 343                 'url': video_url,
 344                 'ext': 'flv',
 345                 'title': video_title,
 346                 'uploader': uploader,
 347                 'thumbnail': thumbnail,
 348                }
 349         return info
 350
 351 class WorldStarHipHopIE(InfoExtractor):
 352     _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
 353     IE_NAME = u'WorldStarHipHop'
 354
 355     def _real_extract(self, url):
 356         m = re.match(self._VALID_URL, url)
 357         video_id = m.group('id')
 358
 359         webpage_src = self._download_webpage(url, video_id)
 360
 361         video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
 362             webpage_src, u'video URL')
 363
 364         if 'mp4' in video_url:
 365             ext = 'mp4'
 366         else:
 367             ext = 'flv'
 368
 369         video_title = self._html_search_regex(r"<title>(.*)</title>",
 370             webpage_src, u'title')
 371
 372         # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
 373         thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
 374             webpage_src, u'thumbnail', fatal=False)
 375
 376         if not thumbnail:
 377             _title = r"""candytitles.*>(.*)</span>"""
 378             mobj = re.search(_title, webpage_src)
 379             if mobj is not None:
 380                 video_title = mobj.group(1)
 381
 382         results = [{
 383                     'id': video_id,
 384                     'url' : video_url,
 385                     'title' : video_title,
 386                     'thumbnail' : thumbnail,
 387                     'ext' : ext,
 388                     }]
 389         return results
 390
 391 class RBMARadioIE(InfoExtractor):
 392     _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
 393
 394     def _real_extract(self, url):
 395         m = re.match(self._VALID_URL, url)
 396         video_id = m.group('videoID')
 397
 398         webpage = self._download_webpage(url, video_id)
 399
 400         json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
 401             webpage, u'json data', flags=re.MULTILINE)
 402
 403         try:
 404             data = json.loads(json_data)
 405         except ValueError as e:
 406             raise ExtractorError(u'Invalid JSON: ' + str(e))
 407
 408         video_url = data['akamai_url'] + '&cbr=256'
 409         url_parts = compat_urllib_parse_urlparse(video_url)
 410         video_ext = url_parts.path.rpartition('.')[2]
 411         info = {
 412                 'id': video_id,
 413                 'url': video_url,
 414                 'ext': video_ext,
 415                 'title': data['title'],
 416                 'description': data.get('teaser_text'),
 417                 'location': data.get('country_of_origin'),
 418                 'uploader': data.get('host', {}).get('name'),
 419                 'uploader_id': data.get('host', {}).get('slug'),
 420                 'thumbnail': data.get('image', {}).get('large_url_2x'),
 421                 'duration': data.get('duration'),
 422         }
 423         return [info]
 424
 425
 426 class YouPornIE(InfoExtractor):
 427     """Information extractor for youporn.com."""
 428     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
 429
 430     def _print_formats(self, formats):
 431         """Print all available formats"""
 432         print(u'Available formats:')
 433         print(u'ext\t\tformat')
 434         print(u'---------------------------------')
 435         for format in formats:
 436             print(u'%s\t\t%s'  % (format['ext'], format['format']))
 437
 438     def _specific(self, req_format, formats):
 439         for x in formats:
 440             if(x["format"]==req_format):
 441                 return x
 442         return None
 443
 444     def _real_extract(self, url):
 445         mobj = re.match(self._VALID_URL, url)
 446         if mobj is None:
 447             raise ExtractorError(u'Invalid URL: %s' % url)
 448         video_id = mobj.group('videoid')
 449
 450         req = compat_urllib_request.Request(url)
 451         req.add_header('Cookie', 'age_verified=1')
 452         webpage = self._download_webpage(req, video_id)
 453
 454         # Get JSON parameters
 455         json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
 456         try:
 457             params = json.loads(json_params)
 458         except:
 459             raise ExtractorError(u'Invalid JSON')
 460
 461         self.report_extraction(video_id)
 462         try:
 463             video_title = params['title']
 464             upload_date = unified_strdate(params['release_date_f'])
 465             video_description = params['description']
 466             video_uploader = params['submitted_by']
 467             thumbnail = params['thumbnails'][0]['image']
 468         except KeyError:
 469             raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])
 470
 471         # Get all of the formats available
 472         DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
 473         download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
 474             webpage, u'download list').strip()
 475
 476         # Get all of the links from the page
 477         LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
 478         links = re.findall(LINK_RE, download_list_html)
 479         if(len(links) == 0):
 480             raise ExtractorError(u'ERROR: no known formats available for video')
 481
 482         self.to_screen(u'Links found: %d' % len(links))
 483
 484         formats = []
 485         for link in links:
 486
 487             # A link looks like this:
 488             # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
 489             # A path looks like this:
 490             # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
 491             video_url = unescapeHTML( link )
 492             path = compat_urllib_parse_urlparse( video_url ).path
 493             extension = os.path.splitext( path )[1][1:]
 494             format = path.split('/')[4].split('_')[:2]
 495             size = format[0]
 496             bitrate = format[1]
 497             format = "-".join( format )
 498             # title = u'%s-%s-%s' % (video_title, size, bitrate)
 499
 500             formats.append({
 501                 'id': video_id,
 502                 'url': video_url,
 503                 'uploader': video_uploader,
 504                 'upload_date': upload_date,
 505                 'title': video_title,
 506                 'ext': extension,
 507                 'format': format,
 508                 'thumbnail': thumbnail,
 509                 'description': video_description
 510             })
 511
 512         if self._downloader.params.get('listformats', None):
 513             self._print_formats(formats)
 514             return
 515
 516         req_format = self._downloader.params.get('format', None)
 517         self.to_screen(u'Format: %s' % req_format)
 518
 519         if req_format is None or req_format == 'best':
 520             return [formats[0]]
 521         elif req_format == 'worst':
 522             return [formats[-1]]
 523         elif req_format in ('-1', 'all'):
 524             return formats
 525         else:
 526             format = self._specific( req_format, formats )
 527             if result is None:
 528                 raise ExtractorError(u'Requested format not available')
 529             return [format]
 530
 531
 532
 533 class PornotubeIE(InfoExtractor):
 534     """Information extractor for pornotube.com."""
 535     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
 536
 537     def _real_extract(self, url):
 538         mobj = re.match(self._VALID_URL, url)
 539         if mobj is None:
 540             raise ExtractorError(u'Invalid URL: %s' % url)
 541
 542         video_id = mobj.group('videoid')
 543         video_title = mobj.group('title')
 544
 545         # Get webpage content
 546         webpage = self._download_webpage(url, video_id)
 547
 548         # Get the video URL
 549         VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
 550         video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
 551         video_url = compat_urllib_parse.unquote(video_url)
 552
 553         #Get the uploaded date
 554         VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
 555         upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
 556         if upload_date: upload_date = unified_strdate(upload_date)
 557
 558         info = {'id': video_id,
 559                 'url': video_url,
 560                 'uploader': None,
 561                 'upload_date': upload_date,
 562                 'title': video_title,
 563                 'ext': 'flv',
 564                 'format': 'flv'}
 565
 566         return [info]
 567
 568 class YouJizzIE(InfoExtractor):
 569     """Information extractor for youjizz.com."""
 570     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
 571
 572     def _real_extract(self, url):
 573         mobj = re.match(self._VALID_URL, url)
 574         if mobj is None:
 575             raise ExtractorError(u'Invalid URL: %s' % url)
 576
 577         video_id = mobj.group('videoid')
 578
 579         # Get webpage content
 580         webpage = self._download_webpage(url, video_id)
 581
 582         # Get the video title
 583         video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
 584             webpage, u'title').strip()
 585
 586         # Get the embed page
 587         result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
 588         if result is None:
 589             raise ExtractorError(u'ERROR: unable to extract embed page')
 590
 591         embed_page_url = result.group(0).strip()
 592         video_id = result.group('videoid')
 593
 594         webpage = self._download_webpage(embed_page_url, video_id)
 595
 596         # Get the video URL
 597         video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
 598             webpage, u'video URL')
 599
 600         info = {'id': video_id,
 601                 'url': video_url,
 602                 'title': video_title,
 603                 'ext': 'flv',
 604                 'format': 'flv',
 605                 'player_url': embed_page_url}
 606
 607         return [info]
 608
 609 class EightTracksIE(InfoExtractor):
 610     IE_NAME = '8tracks'
 611     _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
 612
 613     def _real_extract(self, url):
 614         mobj = re.match(self._VALID_URL, url)
 615         if mobj is None:
 616             raise ExtractorError(u'Invalid URL: %s' % url)
 617         playlist_id = mobj.group('id')
 618
 619         webpage = self._download_webpage(url, playlist_id)
 620
 621         json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
 622         data = json.loads(json_like)
 623
 624         session = str(random.randint(0, 1000000000))
 625         mix_id = data['id']
 626         track_count = data['tracks_count']
 627         first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
 628         next_url = first_url
 629         res = []
 630         for i in itertools.count():
 631             api_json = self._download_webpage(next_url, playlist_id,
 632                 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
 633                 errnote=u'Failed to download song information')
 634             api_data = json.loads(api_json)
 635             track_data = api_data[u'set']['track']
 636             info = {
 637                 'id': track_data['id'],
 638                 'url': track_data['track_file_stream_url'],
 639                 'title': track_data['performer'] + u' - ' + track_data['name'],
 640                 'raw_title': track_data['name'],
 641                 'uploader_id': data['user']['login'],
 642                 'ext': 'm4a',
 643             }
 644             res.append(info)
 645             if api_data['set']['at_last_track']:
 646                 break
 647             next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
 648         return res
 649
 650 class KeekIE(InfoExtractor):
 651     _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
 652     IE_NAME = u'keek'
 653
 654     def _real_extract(self, url):
 655         m = re.match(self._VALID_URL, url)
 656         video_id = m.group('videoID')
 657
 658         video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
 659         thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
 660         webpage = self._download_webpage(url, video_id)
 661
 662         video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
 663             webpage, u'title')
 664
 665         uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
 666             webpage, u'uploader', fatal=False)
 667
 668         info = {
 669                 'id': video_id,
 670                 'url': video_url,
 671                 'ext': 'mp4',
 672                 'title': video_title,
 673                 'thumbnail': thumbnail,
 674                 'uploader': uploader
 675         }
 676         return [info]
 677
 678
 679 class MySpassIE(InfoExtractor):
 680     _VALID_URL = r'http://www.myspass.de/.*'
 681
 682     def _real_extract(self, url):
 683         META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
 684
 685         # video id is the last path element of the URL
 686         # usually there is a trailing slash, so also try the second but last
 687         url_path = compat_urllib_parse_urlparse(url).path
 688         url_parent_path, video_id = os.path.split(url_path)
 689         if not video_id:
 690             _, video_id = os.path.split(url_parent_path)
 691
 692         # get metadata
 693         metadata_url = META_DATA_URL_TEMPLATE % video_id
 694         metadata_text = self._download_webpage(metadata_url, video_id)
 695         metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
 696
 697         # extract values from metadata
 698         url_flv_el = metadata.find('url_flv')
 699         if url_flv_el is None:
 700             raise ExtractorError(u'Unable to extract download url')
 701         video_url = url_flv_el.text
 702         extension = os.path.splitext(video_url)[1][1:]
 703         title_el = metadata.find('title')
 704         if title_el is None:
 705             raise ExtractorError(u'Unable to extract title')
 706         title = title_el.text
 707         format_id_el = metadata.find('format_id')
 708         if format_id_el is None:
 709             format = ext
 710         else:
 711             format = format_id_el.text
 712         description_el = metadata.find('description')
 713         if description_el is not None:
 714             description = description_el.text
 715         else:
 716             description = None
 717         imagePreview_el = metadata.find('imagePreview')
 718         if imagePreview_el is not None:
 719             thumbnail = imagePreview_el.text
 720         else:
 721             thumbnail = None
 722         info = {
 723             'id': video_id,
 724             'url': video_url,
 725             'title': title,
 726             'ext': extension,
 727             'format': format,
 728             'thumbnail': thumbnail,
 729             'description': description
 730         }
 731         return [info]
 732
 733 class SpiegelIE(InfoExtractor):
 734     _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
 735
 736     def _real_extract(self, url):
 737         m = re.match(self._VALID_URL, url)
 738         video_id = m.group('videoID')
 739
 740         webpage = self._download_webpage(url, video_id)
 741
 742         video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
 743             webpage, u'title')
 744
 745         xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
 746         xml_code = self._download_webpage(xml_url, video_id,
 747                     note=u'Downloading XML', errnote=u'Failed to download XML')
 748
 749         idoc = xml.etree.ElementTree.fromstring(xml_code)
 750         last_type = idoc[-1]
 751         filename = last_type.findall('./filename')[0].text
 752         duration = float(last_type.findall('./duration')[0].text)
 753
 754         video_url = 'http://video2.spiegel.de/flash/' + filename
 755         video_ext = filename.rpartition('.')[2]
 756         info = {
 757             'id': video_id,
 758             'url': video_url,
 759             'ext': video_ext,
 760             'title': video_title,
 761             'duration': duration,
 762         }
 763         return [info]
 764
 765 class LiveLeakIE(InfoExtractor):
 766
 767     _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
 768     IE_NAME = u'liveleak'
 769
 770     def _real_extract(self, url):
 771         mobj = re.match(self._VALID_URL, url)
 772         if mobj is None:
 773             raise ExtractorError(u'Invalid URL: %s' % url)
 774
 775         video_id = mobj.group('video_id')
 776
 777         webpage = self._download_webpage(url, video_id)
 778
 779         video_url = self._search_regex(r'file: "(.*?)",',
 780             webpage, u'video URL')
 781
 782         video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
 783             webpage, u'title').replace('LiveLeak.com -', '').strip()
 784
 785         video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
 786             webpage, u'description', fatal=False)
 787
 788         video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
 789             webpage, u'uploader', fatal=False)
 790
 791         info = {
 792             'id':  video_id,
 793             'url': video_url,
 794             'ext': 'mp4',
 795             'title': video_title,
 796             'description': video_description,
 797             'uploader': video_uploader
 798         }
 799
 800         return [info]
 801
 802
 803
 804 class TumblrIE(InfoExtractor):
 805     _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
 806
 807     def _real_extract(self, url):
 808         m_url = re.match(self._VALID_URL, url)
 809         video_id = m_url.group('id')
 810         blog = m_url.group('blog_name')
 811
 812         url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
 813         webpage = self._download_webpage(url, video_id)
 814
 815         re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
 816         video = re.search(re_video, webpage)
 817         if video is None:
 818            raise ExtractorError(u'Unable to extract video')
 819         video_url = video.group('video_url')
 820         ext = video.group('ext')
 821
 822         video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
 823             webpage, u'thumbnail', fatal=False)  # We pick the first poster
 824         if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
 825
 826         # The only place where you can get a title, it's not complete,
 827         # but searching in other places doesn't work for all videos
 828         video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
 829             webpage, u'title', flags=re.DOTALL)
 830
 831         return [{'id': video_id,
 832                  'url': video_url,
 833                  'title': video_title,
 834                  'thumbnail': video_thumbnail,
 835                  'ext': ext
 836                  }]
 837
 838 class BandcampIE(InfoExtractor):
 839     _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
 840
 841     def _real_extract(self, url):
 842         mobj = re.match(self._VALID_URL, url)
 843         title = mobj.group('title')
 844         webpage = self._download_webpage(url, title)
 845         # We get the link to the free download page
 846         m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
 847         if m_download is None:
 848             raise ExtractorError(u'No free songs found')
 849
 850         download_link = m_download.group(1)
 851         id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
 852                        webpage, re.MULTILINE|re.DOTALL).group('id')
 853
 854         download_webpage = self._download_webpage(download_link, id,
 855                                                   'Downloading free downloads page')
 856         # We get the dictionary of the track from some javascrip code
 857         info = re.search(r'items: (.*?),$',
 858                          download_webpage, re.MULTILINE).group(1)
 859         info = json.loads(info)[0]
 860         # We pick mp3-320 for now, until format selection can be easily implemented.
 861         mp3_info = info[u'downloads'][u'mp3-320']
 862         # If we try to use this url it says the link has expired
 863         initial_url = mp3_info[u'url']
 864         re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
 865         m_url = re.match(re_url, initial_url)
 866         #We build the url we will use to get the final track url
 867         # This url is build in Bandcamp in the script download_bunde_*.js
 868         request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
 869         final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
 870         # If we could correctly generate the .rand field the url would be
 871         #in the "download_url" key
 872         final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
 873
 874         track_info = {'id':id,
 875                       'title' : info[u'title'],
 876                       'ext' :   'mp3',
 877                       'url' :   final_url,
 878                       'thumbnail' : info[u'thumb_url'],
 879                       'uploader' :  info[u'artist']
 880                       }
 881
 882         return [track_info]
 883
 884 class RedTubeIE(InfoExtractor):
 885     """Information Extractor for redtube"""
 886     _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
 887
 888     def _real_extract(self,url):
 889         mobj = re.match(self._VALID_URL, url)
 890         if mobj is None:
 891             raise ExtractorError(u'Invalid URL: %s' % url)
 892
 893         video_id = mobj.group('id')
 894         video_extension = 'mp4'
 895         webpage = self._download_webpage(url, video_id)
 896
 897         self.report_extraction(video_id)
 898
 899         video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
 900             webpage, u'video URL')
 901
 902         video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
 903             webpage, u'title')
 904
 905         return [{
 906             'id':       video_id,
 907             'url':      video_url,
 908             'ext':      video_extension,
 909             'title':    video_title,
 910         }]
 911
 912 class InaIE(InfoExtractor):
 913     """Information Extractor for Ina.fr"""
 914     _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
 915
 916     def _real_extract(self,url):
 917         mobj = re.match(self._VALID_URL, url)
 918
 919         video_id = mobj.group('id')
 920         mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
 921         video_extension = 'mp4'
 922         webpage = self._download_webpage(mrss_url, video_id)
 923
 924         self.report_extraction(video_id)
 925
 926         video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
 927             webpage, u'video URL')
 928
 929         video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
 930             webpage, u'title')
 931
 932         return [{
 933             'id':       video_id,
 934             'url':      video_url,
 935             'ext':      video_extension,
 936             'title':    video_title,
 937         }]
 938
 939 class HowcastIE(InfoExtractor):
 940     """Information Extractor for Howcast.com"""
 941     _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
 942
 943     def _real_extract(self, url):
 944         mobj = re.match(self._VALID_URL, url)
 945
 946         video_id = mobj.group('id')
 947         webpage_url = 'http://www.howcast.com/videos/' + video_id
 948         webpage = self._download_webpage(webpage_url, video_id)
 949
 950         self.report_extraction(video_id)
 951
 952         video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
 953             webpage, u'video URL')
 954
 955         video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
 956             webpage, u'title')
 957
 958         video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
 959             webpage, u'description', fatal=False)
 960
 961         thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
 962             webpage, u'thumbnail', fatal=False)
 963
 964         return [{
 965             'id':       video_id,
 966             'url':      video_url,
 967             'ext':      'mp4',
 968             'title':    video_title,
 969             'description': video_description,
 970             'thumbnail': thumbnail,
 971         }]
 972
 973 class VineIE(InfoExtractor):
 974     """Information Extractor for Vine.co"""
 975     _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
 976
 977     def _real_extract(self, url):
 978         mobj = re.match(self._VALID_URL, url)
 979
 980         video_id = mobj.group('id')
 981         webpage_url = 'https://vine.co/v/' + video_id
 982         webpage = self._download_webpage(webpage_url, video_id)
 983
 984         self.report_extraction(video_id)
 985
 986         video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
 987             webpage, u'video URL')
 988
 989         video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
 990             webpage, u'title')
 991
 992         thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
 993             webpage, u'thumbnail', fatal=False)
 994
 995         uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
 996             webpage, u'uploader', fatal=False, flags=re.DOTALL)
 997
 998         return [{
 999             'id':        video_id,
1000             'url':       video_url,
1001             'ext':       'mp4',
1002             'title':     video_title,
1003             'thumbnail': thumbnail,
1004             'uploader':  uploader,
1005         }]
1006
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for the video.

        Three requests are made: the photo page (for the secret and the
        metadata), a first XML document (for the node id) and a second XML
        document (for the stream location).

        Raises ExtractorError if the stream location is not found.
        """
        url_match = re.match(self._VALID_URL, url)
        video_id = url_match.group('id')
        uploader_id = url_match.group('uploader_id')

        page_url = 'http://www.flickr.com/photos/' + '/'.join((uploader_id, video_id))
        page = self._download_webpage(page_url, video_id)

        secret = self._search_regex(r"photo_secret: '(\w+)'", page, u'secret')

        # Step 1: the first XML document gives the node id of the video
        mtl_url = ''.join((
            'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=',
            video_id,
            '&secret=',
            secret,
            '&bitrate=700&target=_self',
        ))
        mtl_xml = self._download_webpage(mtl_url, video_id, 'Downloading first data webpage')
        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            mtl_xml, u'node_id')

        # Step 2: the playlist document for that node holds the stream location
        playlist_url = ''.join((
            'https://secure.flickr.com/video_playlist.gne?node_id=',
            node_id,
            '&tech=flash&mode=playlist&bitrate=700&secret=',
            secret,
            '&rd=video.yahoo.com&noad=1',
        ))
        playlist_xml = self._download_webpage(playlist_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        stream = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', playlist_xml)
        if stream is None:
            raise ExtractorError(u'Unable to extract video url')
        app, full_path = stream.group(1), stream.group(2)
        video_url = app + unescapeHTML(full_path)

        title = self._html_search_regex(
            r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'video title')
        description = self._html_search_regex(
            r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'description', fatal=False)
        poster = self._html_search_regex(
            r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'thumbnail', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
            'thumbnail': poster,
            'uploader_id': uploader_id,
        }
        return [info]
1055
class TeamcocoIE(InfoExtractor):
    """Information Extractor for teamcoco.com"""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for the video.

        The numeric video id and the metadata are read from the video page;
        the media URL is read from the XML document served for that id.

        Raises ExtractorError if the URL does not match _VALID_URL.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # Until the numeric id is known, the URL title identifies the download
        page = self._download_webpage(url, url_match.group('url_title'))

        video_id = self._html_search_regex(
            r'<article class="video" data-id="(\d+?)"', page, u'video id')

        self.report_extraction(video_id)

        title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"', page, u'title')
        poster = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)"',
            page, u'thumbnail', fatal=False)
        description = self._html_search_regex(
            r'<meta property="og:description" content="(.*?)"',
            page, u'description', fatal=False)

        cvp_xml = self._download_webpage(
            'http://teamcoco.com/cvp/2.0/%s.xml' % video_id,
            video_id, 'Downloading data webpage')
        media_url = self._html_search_regex(
            r'<file type="high".*?>(.*?)</file>', cvp_xml, u'video URL')

        info = {
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': poster,
            'description': description,
        }
        return [info]
1094
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    # The dot after "www" is escaped so it only matches a literal "."
    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for the video.

        The page is fetched through a URL rebuilt from the video id. A
        missing upload date only produces a warning; a missing uploader id
        falls back to u'anonymous'.

        Raises ExtractorError if the URL does not match _VALID_URL or if the
        media location is not found in the page.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if not mobj.group('server'):
            # No server given: the file field is itself a URL-quoted location
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server')+'/key='+mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # Can't see the description anywhere in the UI
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        #     webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            # Concatenate into YYYYMMDD
            video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
1146
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for the track.

        The track page is requested first; its Set-Cookie header is sent
        back with the second request, which returns the JSON document
        holding the final media URL.

        Raises ExtractorError if the URL does not match _VALID_URL, if a
        response is not valid JSON, or if the page lists no track.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        data = { 'ax': 1, 'ts': time.time() }
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        try:
            track = track_list[u'tracks'][0]
        except (KeyError, IndexError):
            # Valid JSON, but without any track entry
            raise ExtractorError(u'Hypemachine contained no tracks.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        # An empty body is still passed, as before, so the request carries
        # data. It is given as bytes because Python 3's urllib rejects str
        # request data (presumably compat_urllib_request is urllib.request
        # there -- confirm); on Python 2, b"" is the same object type as "".
        request = compat_urllib_request.Request(serve_url, b"", {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id':       track_id,
            'url':      final_url,
            'ext':      "mp3",
            'title':    title,
            'artist':   artist,
        }]
1196
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for the video.

        The first page only contains a JavaScript redirect; the title is
        read from the redirect target. The media and thumbnail URLs come
        from a form-encoded POST to magare.do, whose response is parsed as
        two "name=value" fields joined by "&".

        Raises ExtractorError if the URL does not match _VALID_URL or if the
        info response cannot be parsed.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        # Only the part of the page title before the first "/" is kept
        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        ext = "flv"
        info_url = "http://vbox7.com/play/magare.do"
        # Request data must be bytes on Python 3 (urllib rejects str there);
        # the urlencoded form is pure ASCII, so this is a no-op on Python 2.
        data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id}).encode('ascii')
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        try:
            # Split each field on the first "=" only, so that values which
            # themselves contain "=" (e.g. URLs with a query) are kept whole.
            final_url, thumbnail_url = [
                field.split('=', 1)[1] for field in info_response.split('&')]
        except (IndexError, ValueError):
            # A field without "=", or not exactly two fields
            raise ExtractorError(u'Unable to extract the media url')

        return [{
            'id':        video_id,
            'url':       final_url,
            'ext':       ext,
            'title':     title,
            'thumbnail': thumbnail_url,
        }]
1232
1233
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Classes are listed in matching priority order and instantiated below.
    ordered_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVIE,
        BlipTVUserIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        ZDFIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        XHamsterIE,
        HypemIE,
        Vbox7IE,
        GametrailersIE,
        StatigramIE,
        GenericIE,
    )
    return [ie_class() for ie_class in ordered_classes]
1303
def get_info_extractor(ie_name):
    """Look up an info extractor class by name.

    ie_name is the class name without its 'IE' suffix; the class is read
    from this module's global namespace, so an unknown name raises KeyError.
    """
    class_name = ie_name + 'IE'
    module_namespace = globals()
    return module_namespace[class_name]