youtube_dl/InfoExtractors.py

   1 import base64
   2 import datetime
   3 import itertools
   4 import netrc
   5 import os
   6 import re
   7 import socket
   8 import time
   9 import email.utils
  10 import xml.etree.ElementTree
  11 import random
  12 import math
  13 import operator
  14 import hashlib
  15 import binascii
  16 import urllib
  17
  18 from .utils import *
  19 from .extractor.common import InfoExtractor, SearchInfoExtractor
  20
  21 from .extractor.ard import ARDIE
  22 from .extractor.arte import ArteTvIE
  23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
  24 from .extractor.comedycentral import ComedyCentralIE
  25 from .extractor.collegehumor import CollegeHumorIE
  26 from .extractor.dailymotion import DailymotionIE
  27 from .extractor.depositfiles import DepositFilesIE
  28 from .extractor.escapist import EscapistIE
  29 from .extractor.facebook import FacebookIE
  30 from .extractor.funnyordie import FunnyOrDieIE
  31 from .extractor.gametrailers import GametrailersIE
  32 from .extractor.generic import GenericIE
  33 from .extractor.googleplus import GooglePlusIE
  34 from .extractor.googlesearch import GoogleSearchIE
  35 from .extractor.infoq import InfoQIE
  36 from .extractor.justintv import JustinTVIE
  37 from .extractor.metacafe import MetacafeIE
  38 from .extractor.mixcloud import MixcloudIE
  39 from .extractor.mtv import MTVIE
  40 from .extractor.myvideo import MyVideoIE
  41 from .extractor.nba import NBAIE
  42 from .extractor.statigram import StatigramIE
  43 from .extractor.photobucket import PhotobucketIE
  44 from .extractor.pornotube import PornotubeIE
  45 from .extractor.rbmaradio import RBMARadioIE
  46 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
  47 from .extractor.stanfordoc import StanfordOpenClassroomIE
  48 from .extractor.steam import SteamIE
  49 from .extractor.ted import TEDIE
  50 from .extractor.ustream import UstreamIE
  51 from .extractor.vimeo import VimeoIE
  52 from .extractor.worldstarhiphop import WorldStarHipHopIE
  53 from .extractor.xnxx import XNXXIE
  54 from .extractor.xvideos import XVideosIE
  55 from .extractor.yahoo import YahooIE, YahooSearchIE
  56 from .extractor.youku import YoukuIE
  57 from .extractor.youporn import YouPornIE
  58 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
  59 from .extractor.zdf import ZDFIE
  60
  61
  62
  63
  64
  65
  66
  67
  68
  69
  70
  71
  72
  73
  74
  75
  76
  77
  78
  79
  80
  81
  82
  83
  84 class YouJizzIE(InfoExtractor):
  85     """Information extractor for youjizz.com."""
  86     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
  87
  88     def _real_extract(self, url):
  89         mobj = re.match(self._VALID_URL, url)
  90         if mobj is None:
  91             raise ExtractorError(u'Invalid URL: %s' % url)
  92
  93         video_id = mobj.group('videoid')
  94
  95         # Get webpage content
  96         webpage = self._download_webpage(url, video_id)
  97
  98         # Get the video title
  99         video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
 100             webpage, u'title').strip()
 101
 102         # Get the embed page
 103         result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
 104         if result is None:
 105             raise ExtractorError(u'ERROR: unable to extract embed page')
 106
 107         embed_page_url = result.group(0).strip()
 108         video_id = result.group('videoid')
 109
 110         webpage = self._download_webpage(embed_page_url, video_id)
 111
 112         # Get the video URL
 113         video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
 114             webpage, u'video URL')
 115
 116         info = {'id': video_id,
 117                 'url': video_url,
 118                 'title': video_title,
 119                 'ext': 'flv',
 120                 'format': 'flv',
 121                 'player_url': embed_page_url}
 122
 123         return [info]
 124
 125 class EightTracksIE(InfoExtractor):
 126     IE_NAME = '8tracks'
 127     _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
 128
 129     def _real_extract(self, url):
 130         mobj = re.match(self._VALID_URL, url)
 131         if mobj is None:
 132             raise ExtractorError(u'Invalid URL: %s' % url)
 133         playlist_id = mobj.group('id')
 134
 135         webpage = self._download_webpage(url, playlist_id)
 136
 137         json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
 138         data = json.loads(json_like)
 139
 140         session = str(random.randint(0, 1000000000))
 141         mix_id = data['id']
 142         track_count = data['tracks_count']
 143         first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
 144         next_url = first_url
 145         res = []
 146         for i in itertools.count():
 147             api_json = self._download_webpage(next_url, playlist_id,
 148                 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
 149                 errnote=u'Failed to download song information')
 150             api_data = json.loads(api_json)
 151             track_data = api_data[u'set']['track']
 152             info = {
 153                 'id': track_data['id'],
 154                 'url': track_data['track_file_stream_url'],
 155                 'title': track_data['performer'] + u' - ' + track_data['name'],
 156                 'raw_title': track_data['name'],
 157                 'uploader_id': data['user']['login'],
 158                 'ext': 'm4a',
 159             }
 160             res.append(info)
 161             if api_data['set']['at_last_track']:
 162                 break
 163             next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
 164         return res
 165
 166 class KeekIE(InfoExtractor):
 167     _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
 168     IE_NAME = u'keek'
 169
 170     def _real_extract(self, url):
 171         m = re.match(self._VALID_URL, url)
 172         video_id = m.group('videoID')
 173
 174         video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
 175         thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
 176         webpage = self._download_webpage(url, video_id)
 177
 178         video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
 179             webpage, u'title')
 180
 181         uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
 182             webpage, u'uploader', fatal=False)
 183
 184         info = {
 185                 'id': video_id,
 186                 'url': video_url,
 187                 'ext': 'mp4',
 188                 'title': video_title,
 189                 'thumbnail': thumbnail,
 190                 'uploader': uploader
 191         }
 192         return [info]
 193
 194
 195 class MySpassIE(InfoExtractor):
 196     _VALID_URL = r'http://www.myspass.de/.*'
 197
 198     def _real_extract(self, url):
 199         META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
 200
 201         # video id is the last path element of the URL
 202         # usually there is a trailing slash, so also try the second but last
 203         url_path = compat_urllib_parse_urlparse(url).path
 204         url_parent_path, video_id = os.path.split(url_path)
 205         if not video_id:
 206             _, video_id = os.path.split(url_parent_path)
 207
 208         # get metadata
 209         metadata_url = META_DATA_URL_TEMPLATE % video_id
 210         metadata_text = self._download_webpage(metadata_url, video_id)
 211         metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
 212
 213         # extract values from metadata
 214         url_flv_el = metadata.find('url_flv')
 215         if url_flv_el is None:
 216             raise ExtractorError(u'Unable to extract download url')
 217         video_url = url_flv_el.text
 218         extension = os.path.splitext(video_url)[1][1:]
 219         title_el = metadata.find('title')
 220         if title_el is None:
 221             raise ExtractorError(u'Unable to extract title')
 222         title = title_el.text
 223         format_id_el = metadata.find('format_id')
 224         if format_id_el is None:
 225             format = ext
 226         else:
 227             format = format_id_el.text
 228         description_el = metadata.find('description')
 229         if description_el is not None:
 230             description = description_el.text
 231         else:
 232             description = None
 233         imagePreview_el = metadata.find('imagePreview')
 234         if imagePreview_el is not None:
 235             thumbnail = imagePreview_el.text
 236         else:
 237             thumbnail = None
 238         info = {
 239             'id': video_id,
 240             'url': video_url,
 241             'title': title,
 242             'ext': extension,
 243             'format': format,
 244             'thumbnail': thumbnail,
 245             'description': description
 246         }
 247         return [info]
 248
 249 class SpiegelIE(InfoExtractor):
 250     _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
 251
 252     def _real_extract(self, url):
 253         m = re.match(self._VALID_URL, url)
 254         video_id = m.group('videoID')
 255
 256         webpage = self._download_webpage(url, video_id)
 257
 258         video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
 259             webpage, u'title')
 260
 261         xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
 262         xml_code = self._download_webpage(xml_url, video_id,
 263                     note=u'Downloading XML', errnote=u'Failed to download XML')
 264
 265         idoc = xml.etree.ElementTree.fromstring(xml_code)
 266         last_type = idoc[-1]
 267         filename = last_type.findall('./filename')[0].text
 268         duration = float(last_type.findall('./duration')[0].text)
 269
 270         video_url = 'http://video2.spiegel.de/flash/' + filename
 271         video_ext = filename.rpartition('.')[2]
 272         info = {
 273             'id': video_id,
 274             'url': video_url,
 275             'ext': video_ext,
 276             'title': video_title,
 277             'duration': duration,
 278         }
 279         return [info]
 280
 281 class LiveLeakIE(InfoExtractor):
 282
 283     _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
 284     IE_NAME = u'liveleak'
 285
 286     def _real_extract(self, url):
 287         mobj = re.match(self._VALID_URL, url)
 288         if mobj is None:
 289             raise ExtractorError(u'Invalid URL: %s' % url)
 290
 291         video_id = mobj.group('video_id')
 292
 293         webpage = self._download_webpage(url, video_id)
 294
 295         video_url = self._search_regex(r'file: "(.*?)",',
 296             webpage, u'video URL')
 297
 298         video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
 299             webpage, u'title').replace('LiveLeak.com -', '').strip()
 300
 301         video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
 302             webpage, u'description', fatal=False)
 303
 304         video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
 305             webpage, u'uploader', fatal=False)
 306
 307         info = {
 308             'id':  video_id,
 309             'url': video_url,
 310             'ext': 'mp4',
 311             'title': video_title,
 312             'description': video_description,
 313             'uploader': video_uploader
 314         }
 315
 316         return [info]
 317
 318
 319
 320 class TumblrIE(InfoExtractor):
 321     _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
 322
 323     def _real_extract(self, url):
 324         m_url = re.match(self._VALID_URL, url)
 325         video_id = m_url.group('id')
 326         blog = m_url.group('blog_name')
 327
 328         url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
 329         webpage = self._download_webpage(url, video_id)
 330
 331         re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
 332         video = re.search(re_video, webpage)
 333         if video is None:
 334            raise ExtractorError(u'Unable to extract video')
 335         video_url = video.group('video_url')
 336         ext = video.group('ext')
 337
 338         video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
 339             webpage, u'thumbnail', fatal=False)  # We pick the first poster
 340         if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
 341
 342         # The only place where you can get a title, it's not complete,
 343         # but searching in other places doesn't work for all videos
 344         video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
 345             webpage, u'title', flags=re.DOTALL)
 346
 347         return [{'id': video_id,
 348                  'url': video_url,
 349                  'title': video_title,
 350                  'thumbnail': video_thumbnail,
 351                  'ext': ext
 352                  }]
 353
 354 class BandcampIE(InfoExtractor):
 355     _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
 356
 357     def _real_extract(self, url):
 358         mobj = re.match(self._VALID_URL, url)
 359         title = mobj.group('title')
 360         webpage = self._download_webpage(url, title)
 361         # We get the link to the free download page
 362         m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
 363         if m_download is None:
 364             raise ExtractorError(u'No free songs found')
 365
 366         download_link = m_download.group(1)
 367         id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
 368                        webpage, re.MULTILINE|re.DOTALL).group('id')
 369
 370         download_webpage = self._download_webpage(download_link, id,
 371                                                   'Downloading free downloads page')
 372         # We get the dictionary of the track from some javascrip code
 373         info = re.search(r'items: (.*?),$',
 374                          download_webpage, re.MULTILINE).group(1)
 375         info = json.loads(info)[0]
 376         # We pick mp3-320 for now, until format selection can be easily implemented.
 377         mp3_info = info[u'downloads'][u'mp3-320']
 378         # If we try to use this url it says the link has expired
 379         initial_url = mp3_info[u'url']
 380         re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
 381         m_url = re.match(re_url, initial_url)
 382         #We build the url we will use to get the final track url
 383         # This url is build in Bandcamp in the script download_bunde_*.js
 384         request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
 385         final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
 386         # If we could correctly generate the .rand field the url would be
 387         #in the "download_url" key
 388         final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
 389
 390         track_info = {'id':id,
 391                       'title' : info[u'title'],
 392                       'ext' :   'mp3',
 393                       'url' :   final_url,
 394                       'thumbnail' : info[u'thumb_url'],
 395                       'uploader' :  info[u'artist']
 396                       }
 397
 398         return [track_info]
 399
 400 class RedTubeIE(InfoExtractor):
 401     """Information Extractor for redtube"""
 402     _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
 403
 404     def _real_extract(self,url):
 405         mobj = re.match(self._VALID_URL, url)
 406         if mobj is None:
 407             raise ExtractorError(u'Invalid URL: %s' % url)
 408
 409         video_id = mobj.group('id')
 410         video_extension = 'mp4'
 411         webpage = self._download_webpage(url, video_id)
 412
 413         self.report_extraction(video_id)
 414
 415         video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
 416             webpage, u'video URL')
 417
 418         video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
 419             webpage, u'title')
 420
 421         return [{
 422             'id':       video_id,
 423             'url':      video_url,
 424             'ext':      video_extension,
 425             'title':    video_title,
 426         }]
 427
 428 class InaIE(InfoExtractor):
 429     """Information Extractor for Ina.fr"""
 430     _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
 431
 432     def _real_extract(self,url):
 433         mobj = re.match(self._VALID_URL, url)
 434
 435         video_id = mobj.group('id')
 436         mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
 437         video_extension = 'mp4'
 438         webpage = self._download_webpage(mrss_url, video_id)
 439
 440         self.report_extraction(video_id)
 441
 442         video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
 443             webpage, u'video URL')
 444
 445         video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
 446             webpage, u'title')
 447
 448         return [{
 449             'id':       video_id,
 450             'url':      video_url,
 451             'ext':      video_extension,
 452             'title':    video_title,
 453         }]
 454
 455 class HowcastIE(InfoExtractor):
 456     """Information Extractor for Howcast.com"""
 457     _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
 458
 459     def _real_extract(self, url):
 460         mobj = re.match(self._VALID_URL, url)
 461
 462         video_id = mobj.group('id')
 463         webpage_url = 'http://www.howcast.com/videos/' + video_id
 464         webpage = self._download_webpage(webpage_url, video_id)
 465
 466         self.report_extraction(video_id)
 467
 468         video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
 469             webpage, u'video URL')
 470
 471         video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
 472             webpage, u'title')
 473
 474         video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
 475             webpage, u'description', fatal=False)
 476
 477         thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
 478             webpage, u'thumbnail', fatal=False)
 479
 480         return [{
 481             'id':       video_id,
 482             'url':      video_url,
 483             'ext':      'mp4',
 484             'title':    video_title,
 485             'description': video_description,
 486             'thumbnail': thumbnail,
 487         }]
 488
 489 class VineIE(InfoExtractor):
 490     """Information Extractor for Vine.co"""
 491     _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
 492
 493     def _real_extract(self, url):
 494         mobj = re.match(self._VALID_URL, url)
 495
 496         video_id = mobj.group('id')
 497         webpage_url = 'https://vine.co/v/' + video_id
 498         webpage = self._download_webpage(webpage_url, video_id)
 499
 500         self.report_extraction(video_id)
 501
 502         video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
 503             webpage, u'video URL')
 504
 505         video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
 506             webpage, u'title')
 507
 508         thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
 509             webpage, u'thumbnail', fatal=False)
 510
 511         uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
 512             webpage, u'uploader', fatal=False, flags=re.DOTALL)
 513
 514         return [{
 515             'id':        video_id,
 516             'url':       video_url,
 517             'ext':       'mp4',
 518             'title':     video_title,
 519             'thumbnail': thumbnail,
 520             'uploader':  uploader,
 521         }]
 522
 523 class FlickrIE(InfoExtractor):
 524     """Information Extractor for Flickr videos"""
 525     _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
 526
 527     def _real_extract(self, url):
 528         mobj = re.match(self._VALID_URL, url)
 529
 530         video_id = mobj.group('id')
 531         video_uploader_id = mobj.group('uploader_id')
 532         webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
 533         webpage = self._download_webpage(webpage_url, video_id)
 534
 535         secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
 536
 537         first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
 538         first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
 539
 540         node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
 541             first_xml, u'node_id')
 542
 543         second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
 544         second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
 545
 546         self.report_extraction(video_id)
 547
 548         mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
 549         if mobj is None:
 550             raise ExtractorError(u'Unable to extract video url')
 551         video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
 552
 553         video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
 554             webpage, u'video title')
 555
 556         video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
 557             webpage, u'description', fatal=False)
 558
 559         thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
 560             webpage, u'thumbnail', fatal=False)
 561
 562         return [{
 563             'id':          video_id,
 564             'url':         video_url,
 565             'ext':         'mp4',
 566             'title':       video_title,
 567             'description': video_description,
 568             'thumbnail':   thumbnail,
 569             'uploader_id': video_uploader_id,
 570         }]
 571
 572 class TeamcocoIE(InfoExtractor):
 573     _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
 574
 575     def _real_extract(self, url):
 576         mobj = re.match(self._VALID_URL, url)
 577         if mobj is None:
 578             raise ExtractorError(u'Invalid URL: %s' % url)
 579         url_title = mobj.group('url_title')
 580         webpage = self._download_webpage(url, url_title)
 581
 582         video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
 583             webpage, u'video id')
 584
 585         self.report_extraction(video_id)
 586
 587         video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
 588             webpage, u'title')
 589
 590         thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
 591             webpage, u'thumbnail', fatal=False)
 592
 593         video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
 594             webpage, u'description', fatal=False)
 595
 596         data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
 597         data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
 598
 599         video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
 600             data, u'video URL')
 601
 602         return [{
 603             'id':          video_id,
 604             'url':         video_url,
 605             'ext':         'mp4',
 606             'title':       video_title,
 607             'thumbnail':   thumbnail,
 608             'description': video_description,
 609         }]
 610
 611 class XHamsterIE(InfoExtractor):
 612     """Information Extractor for xHamster"""
 613     _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
 614
 615     def _real_extract(self,url):
 616         mobj = re.match(self._VALID_URL, url)
 617
 618         video_id = mobj.group('id')
 619         mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
 620         webpage = self._download_webpage(mrss_url, video_id)
 621
 622         mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
 623         if mobj is None:
 624             raise ExtractorError(u'Unable to extract media URL')
 625         if len(mobj.group('server')) == 0:
 626             video_url = compat_urllib_parse.unquote(mobj.group('file'))
 627         else:
 628             video_url = mobj.group('server')+'/key='+mobj.group('file')
 629         video_extension = video_url.split('.')[-1]
 630
 631         video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
 632             webpage, u'title')
 633
 634         # Can't see the description anywhere in the UI
 635         # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
 636         #     webpage, u'description', fatal=False)
 637         # if video_description: video_description = unescapeHTML(video_description)
 638
 639         mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
 640         if mobj:
 641             video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
 642         else:
 643             video_upload_date = None
 644             self._downloader.report_warning(u'Unable to extract upload date')
 645
 646         video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
 647             webpage, u'uploader id', default=u'anonymous')
 648
 649         video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
 650             webpage, u'thumbnail', fatal=False)
 651
 652         return [{
 653             'id':       video_id,
 654             'url':      video_url,
 655             'ext':      video_extension,
 656             'title':    video_title,
 657             # 'description': video_description,
 658             'upload_date': video_upload_date,
 659             'uploader_id': video_uploader_id,
 660             'thumbnail': video_thumbnail
 661         }]
 662
 663 class HypemIE(InfoExtractor):
 664     """Information Extractor for hypem"""
 665     _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
 666
 667     def _real_extract(self, url):
 668         mobj = re.match(self._VALID_URL, url)
 669         if mobj is None:
 670             raise ExtractorError(u'Invalid URL: %s' % url)
 671         track_id = mobj.group(1)
 672
 673         data = { 'ax': 1, 'ts': time.time() }
 674         data_encoded = compat_urllib_parse.urlencode(data)
 675         complete_url = url + "?" + data_encoded
 676         request = compat_urllib_request.Request(complete_url)
 677         response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
 678         cookie = urlh.headers.get('Set-Cookie', '')
 679
 680         self.report_extraction(track_id)
 681
 682         html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
 683             response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
 684         try:
 685             track_list = json.loads(html_tracks)
 686             track = track_list[u'tracks'][0]
 687         except ValueError:
 688             raise ExtractorError(u'Hypemachine contained invalid JSON.')
 689
 690         key = track[u"key"]
 691         track_id = track[u"id"]
 692         artist = track[u"artist"]
 693         title = track[u"song"]
 694
 695         serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
 696         request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
 697         request.add_header('cookie', cookie)
 698         song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
 699         try:
 700             song_data = json.loads(song_data_json)
 701         except ValueError:
 702             raise ExtractorError(u'Hypemachine contained invalid JSON.')
 703         final_url = song_data[u"url"]
 704
 705         return [{
 706             'id':       track_id,
 707             'url':      final_url,
 708             'ext':      "mp3",
 709             'title':    title,
 710             'artist':   artist,
 711         }]
 712
 713 class Vbox7IE(InfoExtractor):
 714     """Information Extractor for Vbox7"""
 715     _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
 716
 717     def _real_extract(self,url):
 718         mobj = re.match(self._VALID_URL, url)
 719         if mobj is None:
 720             raise ExtractorError(u'Invalid URL: %s' % url)
 721         video_id = mobj.group(1)
 722
 723         redirect_page, urlh = self._download_webpage_handle(url, video_id)
 724         new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
 725         redirect_url = urlh.geturl() + new_location
 726         webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
 727
 728         title = self._html_search_regex(r'<title>(.*)</title>',
 729             webpage, u'title').split('/')[0].strip()
 730
 731         ext = "flv"
 732         info_url = "http://vbox7.com/play/magare.do"
 733         data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
 734         info_request = compat_urllib_request.Request(info_url, data)
 735         info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
 736         info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
 737         if info_response is None:
 738             raise ExtractorError(u'Unable to extract the media url')
 739         (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
 740
 741         return [{
 742             'id':        video_id,
 743             'url':       final_url,
 744             'ext':       ext,
 745             'title':     title,
 746             'thumbnail': thumbnail_url,
 747         }]
 748
 749
 750 def gen_extractors():
 751     """ Return a list of an instance of every supported extractor.
 752     The order does matter; the first extractor matched is the one handling the URL.
 753     """
 754     return [
 755         YoutubePlaylistIE(),
 756         YoutubeChannelIE(),
 757         YoutubeUserIE(),
 758         YoutubeSearchIE(),
 759         YoutubeIE(),
 760         MetacafeIE(),
 761         DailymotionIE(),
 762         GoogleSearchIE(),
 763         PhotobucketIE(),
 764         YahooIE(),
 765         YahooSearchIE(),
 766         DepositFilesIE(),
 767         FacebookIE(),
 768         BlipTVIE(),
 769         BlipTVUserIE(),
 770         VimeoIE(),
 771         MyVideoIE(),
 772         ComedyCentralIE(),
 773         EscapistIE(),
 774         CollegeHumorIE(),
 775         XVideosIE(),
 776         SoundcloudSetIE(),
 777         SoundcloudIE(),
 778         InfoQIE(),
 779         MixcloudIE(),
 780         StanfordOpenClassroomIE(),
 781         MTVIE(),
 782         YoukuIE(),
 783         XNXXIE(),
 784         YouJizzIE(),
 785         PornotubeIE(),
 786         YouPornIE(),
 787         GooglePlusIE(),
 788         ArteTvIE(),
 789         NBAIE(),
 790         WorldStarHipHopIE(),
 791         JustinTVIE(),
 792         FunnyOrDieIE(),
 793         SteamIE(),
 794         UstreamIE(),
 795         RBMARadioIE(),
 796         EightTracksIE(),
 797         KeekIE(),
 798         TEDIE(),
 799         MySpassIE(),
 800         SpiegelIE(),
 801         LiveLeakIE(),
 802         ARDIE(),
 803         ZDFIE(),
 804         TumblrIE(),
 805         BandcampIE(),
 806         RedTubeIE(),
 807         InaIE(),
 808         HowcastIE(),
 809         VineIE(),
 810         FlickrIE(),
 811         TeamcocoIE(),
 812         XHamsterIE(),
 813         HypemIE(),
 814         Vbox7IE(),
 815         GametrailersIE(),
 816         StatigramIE(),
 817         GenericIE()
 818     ]
 819
 820 def get_info_extractor(ie_name):
 821     """Returns the info extractor class with the given ie_name"""
 822     return globals()[ie_name+'IE']