youtube_dl/InfoExtractors.py

   1 import base64
   2 import datetime
   3 import itertools
   4 import netrc
   5 import os
   6 import re
   7 import socket
   8 import time
   9 import email.utils
  10 import xml.etree.ElementTree
  11 import random
  12 import math
  13 import operator
  14 import hashlib
  15 import binascii
  16 import urllib
  17
  18 from .utils import *
  19 from .extractor.common import InfoExtractor, SearchInfoExtractor
  20
  21 from .extractor.ard import ARDIE
  22 from .extractor.arte import ArteTvIE
  23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
  24 from .extractor.comedycentral import ComedyCentralIE
  25 from .extractor.collegehumor import CollegeHumorIE
  26 from .extractor.dailymotion import DailymotionIE
  27 from .extractor.depositfiles import DepositFilesIE
  28 from .extractor.escapist import EscapistIE
  29 from .extractor.facebook import FacebookIE
  30 from .extractor.funnyordie import FunnyOrDieIE
  31 from .extractor.gametrailers import GametrailersIE
  32 from .extractor.generic import GenericIE
  33 from .extractor.googleplus import GooglePlusIE
  34 from .extractor.googlesearch import GoogleSearchIE
  35 from .extractor.infoq import InfoQIE
  36 from .extractor.justintv import JustinTVIE
  37 from .extractor.metacafe import MetacafeIE
  38 from .extractor.mixcloud import MixcloudIE
  39 from .extractor.mtv import MTVIE
  40 from .extractor.myvideo import MyVideoIE
  41 from .extractor.nba import NBAIE
  42 from .extractor.statigram import StatigramIE
  43 from .extractor.photobucket import PhotobucketIE
  44 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
  45 from .extractor.stanfordoc import StanfordOpenClassroomIE
  46 from .extractor.steam import SteamIE
  47 from .extractor.ted import TEDIE
  48 from .extractor.ustream import UstreamIE
  49 from .extractor.vimeo import VimeoIE
  50 from .extractor.worldstarhiphop import WorldStarHipHopIE
  51 from .extractor.xnxx import XNXXIE
  52 from .extractor.xvideos import XVideosIE
  53 from .extractor.yahoo import YahooIE, YahooSearchIE
  54 from .extractor.youku import YoukuIE
  55 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
  56 from .extractor.zdf import ZDFIE
  57
  58
  59
  60
  61
  62
  63
  64
  65
  66
  67
  68
  69
  70
  71
  72
  73
  74
  75 class RBMARadioIE(InfoExtractor):
  76     _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
  77
  78     def _real_extract(self, url):
  79         m = re.match(self._VALID_URL, url)
  80         video_id = m.group('videoID')
  81
  82         webpage = self._download_webpage(url, video_id)
  83
  84         json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
  85             webpage, u'json data', flags=re.MULTILINE)
  86
  87         try:
  88             data = json.loads(json_data)
  89         except ValueError as e:
  90             raise ExtractorError(u'Invalid JSON: ' + str(e))
  91
  92         video_url = data['akamai_url'] + '&cbr=256'
  93         url_parts = compat_urllib_parse_urlparse(video_url)
  94         video_ext = url_parts.path.rpartition('.')[2]
  95         info = {
  96                 'id': video_id,
  97                 'url': video_url,
  98                 'ext': video_ext,
  99                 'title': data['title'],
 100                 'description': data.get('teaser_text'),
 101                 'location': data.get('country_of_origin'),
 102                 'uploader': data.get('host', {}).get('name'),
 103                 'uploader_id': data.get('host', {}).get('slug'),
 104                 'thumbnail': data.get('image', {}).get('large_url_2x'),
 105                 'duration': data.get('duration'),
 106         }
 107         return [info]
 108
 109
 110 class YouPornIE(InfoExtractor):
 111     """Information extractor for youporn.com."""
 112     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
 113
 114     def _print_formats(self, formats):
 115         """Print all available formats"""
 116         print(u'Available formats:')
 117         print(u'ext\t\tformat')
 118         print(u'---------------------------------')
 119         for format in formats:
 120             print(u'%s\t\t%s'  % (format['ext'], format['format']))
 121
 122     def _specific(self, req_format, formats):
 123         for x in formats:
 124             if(x["format"]==req_format):
 125                 return x
 126         return None
 127
 128     def _real_extract(self, url):
 129         mobj = re.match(self._VALID_URL, url)
 130         if mobj is None:
 131             raise ExtractorError(u'Invalid URL: %s' % url)
 132         video_id = mobj.group('videoid')
 133
 134         req = compat_urllib_request.Request(url)
 135         req.add_header('Cookie', 'age_verified=1')
 136         webpage = self._download_webpage(req, video_id)
 137
 138         # Get JSON parameters
 139         json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
 140         try:
 141             params = json.loads(json_params)
 142         except:
 143             raise ExtractorError(u'Invalid JSON')
 144
 145         self.report_extraction(video_id)
 146         try:
 147             video_title = params['title']
 148             upload_date = unified_strdate(params['release_date_f'])
 149             video_description = params['description']
 150             video_uploader = params['submitted_by']
 151             thumbnail = params['thumbnails'][0]['image']
 152         except KeyError:
 153             raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])
 154
 155         # Get all of the formats available
 156         DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
 157         download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
 158             webpage, u'download list').strip()
 159
 160         # Get all of the links from the page
 161         LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
 162         links = re.findall(LINK_RE, download_list_html)
 163         if(len(links) == 0):
 164             raise ExtractorError(u'ERROR: no known formats available for video')
 165
 166         self.to_screen(u'Links found: %d' % len(links))
 167
 168         formats = []
 169         for link in links:
 170
 171             # A link looks like this:
 172             # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
 173             # A path looks like this:
 174             # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
 175             video_url = unescapeHTML( link )
 176             path = compat_urllib_parse_urlparse( video_url ).path
 177             extension = os.path.splitext( path )[1][1:]
 178             format = path.split('/')[4].split('_')[:2]
 179             size = format[0]
 180             bitrate = format[1]
 181             format = "-".join( format )
 182             # title = u'%s-%s-%s' % (video_title, size, bitrate)
 183
 184             formats.append({
 185                 'id': video_id,
 186                 'url': video_url,
 187                 'uploader': video_uploader,
 188                 'upload_date': upload_date,
 189                 'title': video_title,
 190                 'ext': extension,
 191                 'format': format,
 192                 'thumbnail': thumbnail,
 193                 'description': video_description
 194             })
 195
 196         if self._downloader.params.get('listformats', None):
 197             self._print_formats(formats)
 198             return
 199
 200         req_format = self._downloader.params.get('format', None)
 201         self.to_screen(u'Format: %s' % req_format)
 202
 203         if req_format is None or req_format == 'best':
 204             return [formats[0]]
 205         elif req_format == 'worst':
 206             return [formats[-1]]
 207         elif req_format in ('-1', 'all'):
 208             return formats
 209         else:
 210             format = self._specific( req_format, formats )
 211             if result is None:
 212                 raise ExtractorError(u'Requested format not available')
 213             return [format]
 214
 215
 216
 217 class PornotubeIE(InfoExtractor):
 218     """Information extractor for pornotube.com."""
 219     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
 220
 221     def _real_extract(self, url):
 222         mobj = re.match(self._VALID_URL, url)
 223         if mobj is None:
 224             raise ExtractorError(u'Invalid URL: %s' % url)
 225
 226         video_id = mobj.group('videoid')
 227         video_title = mobj.group('title')
 228
 229         # Get webpage content
 230         webpage = self._download_webpage(url, video_id)
 231
 232         # Get the video URL
 233         VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
 234         video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
 235         video_url = compat_urllib_parse.unquote(video_url)
 236
 237         #Get the uploaded date
 238         VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
 239         upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
 240         if upload_date: upload_date = unified_strdate(upload_date)
 241
 242         info = {'id': video_id,
 243                 'url': video_url,
 244                 'uploader': None,
 245                 'upload_date': upload_date,
 246                 'title': video_title,
 247                 'ext': 'flv',
 248                 'format': 'flv'}
 249
 250         return [info]
 251
 252 class YouJizzIE(InfoExtractor):
 253     """Information extractor for youjizz.com."""
 254     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
 255
 256     def _real_extract(self, url):
 257         mobj = re.match(self._VALID_URL, url)
 258         if mobj is None:
 259             raise ExtractorError(u'Invalid URL: %s' % url)
 260
 261         video_id = mobj.group('videoid')
 262
 263         # Get webpage content
 264         webpage = self._download_webpage(url, video_id)
 265
 266         # Get the video title
 267         video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
 268             webpage, u'title').strip()
 269
 270         # Get the embed page
 271         result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
 272         if result is None:
 273             raise ExtractorError(u'ERROR: unable to extract embed page')
 274
 275         embed_page_url = result.group(0).strip()
 276         video_id = result.group('videoid')
 277
 278         webpage = self._download_webpage(embed_page_url, video_id)
 279
 280         # Get the video URL
 281         video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
 282             webpage, u'video URL')
 283
 284         info = {'id': video_id,
 285                 'url': video_url,
 286                 'title': video_title,
 287                 'ext': 'flv',
 288                 'format': 'flv',
 289                 'player_url': embed_page_url}
 290
 291         return [info]
 292
 293 class EightTracksIE(InfoExtractor):
 294     IE_NAME = '8tracks'
 295     _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
 296
 297     def _real_extract(self, url):
 298         mobj = re.match(self._VALID_URL, url)
 299         if mobj is None:
 300             raise ExtractorError(u'Invalid URL: %s' % url)
 301         playlist_id = mobj.group('id')
 302
 303         webpage = self._download_webpage(url, playlist_id)
 304
 305         json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
 306         data = json.loads(json_like)
 307
 308         session = str(random.randint(0, 1000000000))
 309         mix_id = data['id']
 310         track_count = data['tracks_count']
 311         first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
 312         next_url = first_url
 313         res = []
 314         for i in itertools.count():
 315             api_json = self._download_webpage(next_url, playlist_id,
 316                 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
 317                 errnote=u'Failed to download song information')
 318             api_data = json.loads(api_json)
 319             track_data = api_data[u'set']['track']
 320             info = {
 321                 'id': track_data['id'],
 322                 'url': track_data['track_file_stream_url'],
 323                 'title': track_data['performer'] + u' - ' + track_data['name'],
 324                 'raw_title': track_data['name'],
 325                 'uploader_id': data['user']['login'],
 326                 'ext': 'm4a',
 327             }
 328             res.append(info)
 329             if api_data['set']['at_last_track']:
 330                 break
 331             next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
 332         return res
 333
 334 class KeekIE(InfoExtractor):
 335     _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
 336     IE_NAME = u'keek'
 337
 338     def _real_extract(self, url):
 339         m = re.match(self._VALID_URL, url)
 340         video_id = m.group('videoID')
 341
 342         video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
 343         thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
 344         webpage = self._download_webpage(url, video_id)
 345
 346         video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
 347             webpage, u'title')
 348
 349         uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
 350             webpage, u'uploader', fatal=False)
 351
 352         info = {
 353                 'id': video_id,
 354                 'url': video_url,
 355                 'ext': 'mp4',
 356                 'title': video_title,
 357                 'thumbnail': thumbnail,
 358                 'uploader': uploader
 359         }
 360         return [info]
 361
 362
 363 class MySpassIE(InfoExtractor):
 364     _VALID_URL = r'http://www.myspass.de/.*'
 365
 366     def _real_extract(self, url):
 367         META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
 368
 369         # video id is the last path element of the URL
 370         # usually there is a trailing slash, so also try the second but last
 371         url_path = compat_urllib_parse_urlparse(url).path
 372         url_parent_path, video_id = os.path.split(url_path)
 373         if not video_id:
 374             _, video_id = os.path.split(url_parent_path)
 375
 376         # get metadata
 377         metadata_url = META_DATA_URL_TEMPLATE % video_id
 378         metadata_text = self._download_webpage(metadata_url, video_id)
 379         metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
 380
 381         # extract values from metadata
 382         url_flv_el = metadata.find('url_flv')
 383         if url_flv_el is None:
 384             raise ExtractorError(u'Unable to extract download url')
 385         video_url = url_flv_el.text
 386         extension = os.path.splitext(video_url)[1][1:]
 387         title_el = metadata.find('title')
 388         if title_el is None:
 389             raise ExtractorError(u'Unable to extract title')
 390         title = title_el.text
 391         format_id_el = metadata.find('format_id')
 392         if format_id_el is None:
 393             format = ext
 394         else:
 395             format = format_id_el.text
 396         description_el = metadata.find('description')
 397         if description_el is not None:
 398             description = description_el.text
 399         else:
 400             description = None
 401         imagePreview_el = metadata.find('imagePreview')
 402         if imagePreview_el is not None:
 403             thumbnail = imagePreview_el.text
 404         else:
 405             thumbnail = None
 406         info = {
 407             'id': video_id,
 408             'url': video_url,
 409             'title': title,
 410             'ext': extension,
 411             'format': format,
 412             'thumbnail': thumbnail,
 413             'description': description
 414         }
 415         return [info]
 416
 417 class SpiegelIE(InfoExtractor):
 418     _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
 419
 420     def _real_extract(self, url):
 421         m = re.match(self._VALID_URL, url)
 422         video_id = m.group('videoID')
 423
 424         webpage = self._download_webpage(url, video_id)
 425
 426         video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
 427             webpage, u'title')
 428
 429         xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
 430         xml_code = self._download_webpage(xml_url, video_id,
 431                     note=u'Downloading XML', errnote=u'Failed to download XML')
 432
 433         idoc = xml.etree.ElementTree.fromstring(xml_code)
 434         last_type = idoc[-1]
 435         filename = last_type.findall('./filename')[0].text
 436         duration = float(last_type.findall('./duration')[0].text)
 437
 438         video_url = 'http://video2.spiegel.de/flash/' + filename
 439         video_ext = filename.rpartition('.')[2]
 440         info = {
 441             'id': video_id,
 442             'url': video_url,
 443             'ext': video_ext,
 444             'title': video_title,
 445             'duration': duration,
 446         }
 447         return [info]
 448
 449 class LiveLeakIE(InfoExtractor):
 450
 451     _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
 452     IE_NAME = u'liveleak'
 453
 454     def _real_extract(self, url):
 455         mobj = re.match(self._VALID_URL, url)
 456         if mobj is None:
 457             raise ExtractorError(u'Invalid URL: %s' % url)
 458
 459         video_id = mobj.group('video_id')
 460
 461         webpage = self._download_webpage(url, video_id)
 462
 463         video_url = self._search_regex(r'file: "(.*?)",',
 464             webpage, u'video URL')
 465
 466         video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
 467             webpage, u'title').replace('LiveLeak.com -', '').strip()
 468
 469         video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
 470             webpage, u'description', fatal=False)
 471
 472         video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
 473             webpage, u'uploader', fatal=False)
 474
 475         info = {
 476             'id':  video_id,
 477             'url': video_url,
 478             'ext': 'mp4',
 479             'title': video_title,
 480             'description': video_description,
 481             'uploader': video_uploader
 482         }
 483
 484         return [info]
 485
 486
 487
 488 class TumblrIE(InfoExtractor):
 489     _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
 490
 491     def _real_extract(self, url):
 492         m_url = re.match(self._VALID_URL, url)
 493         video_id = m_url.group('id')
 494         blog = m_url.group('blog_name')
 495
 496         url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
 497         webpage = self._download_webpage(url, video_id)
 498
 499         re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
 500         video = re.search(re_video, webpage)
 501         if video is None:
 502            raise ExtractorError(u'Unable to extract video')
 503         video_url = video.group('video_url')
 504         ext = video.group('ext')
 505
 506         video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
 507             webpage, u'thumbnail', fatal=False)  # We pick the first poster
 508         if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
 509
 510         # The only place where you can get a title, it's not complete,
 511         # but searching in other places doesn't work for all videos
 512         video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
 513             webpage, u'title', flags=re.DOTALL)
 514
 515         return [{'id': video_id,
 516                  'url': video_url,
 517                  'title': video_title,
 518                  'thumbnail': video_thumbnail,
 519                  'ext': ext
 520                  }]
 521
 522 class BandcampIE(InfoExtractor):
 523     _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
 524
 525     def _real_extract(self, url):
 526         mobj = re.match(self._VALID_URL, url)
 527         title = mobj.group('title')
 528         webpage = self._download_webpage(url, title)
 529         # We get the link to the free download page
 530         m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
 531         if m_download is None:
 532             raise ExtractorError(u'No free songs found')
 533
 534         download_link = m_download.group(1)
 535         id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
 536                        webpage, re.MULTILINE|re.DOTALL).group('id')
 537
 538         download_webpage = self._download_webpage(download_link, id,
 539                                                   'Downloading free downloads page')
 540         # We get the dictionary of the track from some javascrip code
 541         info = re.search(r'items: (.*?),$',
 542                          download_webpage, re.MULTILINE).group(1)
 543         info = json.loads(info)[0]
 544         # We pick mp3-320 for now, until format selection can be easily implemented.
 545         mp3_info = info[u'downloads'][u'mp3-320']
 546         # If we try to use this url it says the link has expired
 547         initial_url = mp3_info[u'url']
 548         re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
 549         m_url = re.match(re_url, initial_url)
 550         #We build the url we will use to get the final track url
 551         # This url is build in Bandcamp in the script download_bunde_*.js
 552         request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
 553         final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
 554         # If we could correctly generate the .rand field the url would be
 555         #in the "download_url" key
 556         final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
 557
 558         track_info = {'id':id,
 559                       'title' : info[u'title'],
 560                       'ext' :   'mp3',
 561                       'url' :   final_url,
 562                       'thumbnail' : info[u'thumb_url'],
 563                       'uploader' :  info[u'artist']
 564                       }
 565
 566         return [track_info]
 567
 568 class RedTubeIE(InfoExtractor):
 569     """Information Extractor for redtube"""
 570     _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
 571
 572     def _real_extract(self,url):
 573         mobj = re.match(self._VALID_URL, url)
 574         if mobj is None:
 575             raise ExtractorError(u'Invalid URL: %s' % url)
 576
 577         video_id = mobj.group('id')
 578         video_extension = 'mp4'
 579         webpage = self._download_webpage(url, video_id)
 580
 581         self.report_extraction(video_id)
 582
 583         video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
 584             webpage, u'video URL')
 585
 586         video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
 587             webpage, u'title')
 588
 589         return [{
 590             'id':       video_id,
 591             'url':      video_url,
 592             'ext':      video_extension,
 593             'title':    video_title,
 594         }]
 595
 596 class InaIE(InfoExtractor):
 597     """Information Extractor for Ina.fr"""
 598     _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
 599
 600     def _real_extract(self,url):
 601         mobj = re.match(self._VALID_URL, url)
 602
 603         video_id = mobj.group('id')
 604         mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
 605         video_extension = 'mp4'
 606         webpage = self._download_webpage(mrss_url, video_id)
 607
 608         self.report_extraction(video_id)
 609
 610         video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
 611             webpage, u'video URL')
 612
 613         video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
 614             webpage, u'title')
 615
 616         return [{
 617             'id':       video_id,
 618             'url':      video_url,
 619             'ext':      video_extension,
 620             'title':    video_title,
 621         }]
 622
 623 class HowcastIE(InfoExtractor):
 624     """Information Extractor for Howcast.com"""
 625     _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
 626
 627     def _real_extract(self, url):
 628         mobj = re.match(self._VALID_URL, url)
 629
 630         video_id = mobj.group('id')
 631         webpage_url = 'http://www.howcast.com/videos/' + video_id
 632         webpage = self._download_webpage(webpage_url, video_id)
 633
 634         self.report_extraction(video_id)
 635
 636         video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
 637             webpage, u'video URL')
 638
 639         video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
 640             webpage, u'title')
 641
 642         video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
 643             webpage, u'description', fatal=False)
 644
 645         thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
 646             webpage, u'thumbnail', fatal=False)
 647
 648         return [{
 649             'id':       video_id,
 650             'url':      video_url,
 651             'ext':      'mp4',
 652             'title':    video_title,
 653             'description': video_description,
 654             'thumbnail': thumbnail,
 655         }]
 656
 657 class VineIE(InfoExtractor):
 658     """Information Extractor for Vine.co"""
 659     _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
 660
 661     def _real_extract(self, url):
 662         mobj = re.match(self._VALID_URL, url)
 663
 664         video_id = mobj.group('id')
 665         webpage_url = 'https://vine.co/v/' + video_id
 666         webpage = self._download_webpage(webpage_url, video_id)
 667
 668         self.report_extraction(video_id)
 669
 670         video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
 671             webpage, u'video URL')
 672
 673         video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
 674             webpage, u'title')
 675
 676         thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
 677             webpage, u'thumbnail', fatal=False)
 678
 679         uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
 680             webpage, u'uploader', fatal=False, flags=re.DOTALL)
 681
 682         return [{
 683             'id':        video_id,
 684             'url':       video_url,
 685             'ext':       'mp4',
 686             'title':     video_title,
 687             'thumbnail': thumbnail,
 688             'uploader':  uploader,
 689         }]
 690
 691 class FlickrIE(InfoExtractor):
 692     """Information Extractor for Flickr videos"""
 693     _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
 694
 695     def _real_extract(self, url):
 696         mobj = re.match(self._VALID_URL, url)
 697
 698         video_id = mobj.group('id')
 699         video_uploader_id = mobj.group('uploader_id')
 700         webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
 701         webpage = self._download_webpage(webpage_url, video_id)
 702
 703         secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
 704
 705         first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
 706         first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
 707
 708         node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
 709             first_xml, u'node_id')
 710
 711         second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
 712         second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
 713
 714         self.report_extraction(video_id)
 715
 716         mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
 717         if mobj is None:
 718             raise ExtractorError(u'Unable to extract video url')
 719         video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
 720
 721         video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
 722             webpage, u'video title')
 723
 724         video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
 725             webpage, u'description', fatal=False)
 726
 727         thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
 728             webpage, u'thumbnail', fatal=False)
 729
 730         return [{
 731             'id':          video_id,
 732             'url':         video_url,
 733             'ext':         'mp4',
 734             'title':       video_title,
 735             'description': video_description,
 736             'thumbnail':   thumbnail,
 737             'uploader_id': video_uploader_id,
 738         }]
 739
 740 class TeamcocoIE(InfoExtractor):
 741     _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
 742
 743     def _real_extract(self, url):
 744         mobj = re.match(self._VALID_URL, url)
 745         if mobj is None:
 746             raise ExtractorError(u'Invalid URL: %s' % url)
 747         url_title = mobj.group('url_title')
 748         webpage = self._download_webpage(url, url_title)
 749
 750         video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
 751             webpage, u'video id')
 752
 753         self.report_extraction(video_id)
 754
 755         video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
 756             webpage, u'title')
 757
 758         thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
 759             webpage, u'thumbnail', fatal=False)
 760
 761         video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
 762             webpage, u'description', fatal=False)
 763
 764         data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
 765         data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
 766
 767         video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
 768             data, u'video URL')
 769
 770         return [{
 771             'id':          video_id,
 772             'url':         video_url,
 773             'ext':         'mp4',
 774             'title':       video_title,
 775             'thumbnail':   thumbnail,
 776             'description': video_description,
 777         }]
 778
 779 class XHamsterIE(InfoExtractor):
 780     """Information Extractor for xHamster"""
 781     _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
 782
 783     def _real_extract(self,url):
 784         mobj = re.match(self._VALID_URL, url)
 785
 786         video_id = mobj.group('id')
 787         mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
 788         webpage = self._download_webpage(mrss_url, video_id)
 789
 790         mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
 791         if mobj is None:
 792             raise ExtractorError(u'Unable to extract media URL')
 793         if len(mobj.group('server')) == 0:
 794             video_url = compat_urllib_parse.unquote(mobj.group('file'))
 795         else:
 796             video_url = mobj.group('server')+'/key='+mobj.group('file')
 797         video_extension = video_url.split('.')[-1]
 798
 799         video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
 800             webpage, u'title')
 801
 802         # Can't see the description anywhere in the UI
 803         # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
 804         #     webpage, u'description', fatal=False)
 805         # if video_description: video_description = unescapeHTML(video_description)
 806
 807         mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
 808         if mobj:
 809             video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
 810         else:
 811             video_upload_date = None
 812             self._downloader.report_warning(u'Unable to extract upload date')
 813
 814         video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
 815             webpage, u'uploader id', default=u'anonymous')
 816
 817         video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
 818             webpage, u'thumbnail', fatal=False)
 819
 820         return [{
 821             'id':       video_id,
 822             'url':      video_url,
 823             'ext':      video_extension,
 824             'title':    video_title,
 825             # 'description': video_description,
 826             'upload_date': video_upload_date,
 827             'uploader_id': video_uploader_id,
 828             'thumbnail': video_thumbnail
 829         }]
 830
 831 class HypemIE(InfoExtractor):
 832     """Information Extractor for hypem"""
 833     _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
 834
 835     def _real_extract(self, url):
 836         mobj = re.match(self._VALID_URL, url)
 837         if mobj is None:
 838             raise ExtractorError(u'Invalid URL: %s' % url)
 839         track_id = mobj.group(1)
 840
 841         data = { 'ax': 1, 'ts': time.time() }
 842         data_encoded = compat_urllib_parse.urlencode(data)
 843         complete_url = url + "?" + data_encoded
 844         request = compat_urllib_request.Request(complete_url)
 845         response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
 846         cookie = urlh.headers.get('Set-Cookie', '')
 847
 848         self.report_extraction(track_id)
 849
 850         html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
 851             response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
 852         try:
 853             track_list = json.loads(html_tracks)
 854             track = track_list[u'tracks'][0]
 855         except ValueError:
 856             raise ExtractorError(u'Hypemachine contained invalid JSON.')
 857
 858         key = track[u"key"]
 859         track_id = track[u"id"]
 860         artist = track[u"artist"]
 861         title = track[u"song"]
 862
 863         serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
 864         request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
 865         request.add_header('cookie', cookie)
 866         song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
 867         try:
 868             song_data = json.loads(song_data_json)
 869         except ValueError:
 870             raise ExtractorError(u'Hypemachine contained invalid JSON.')
 871         final_url = song_data[u"url"]
 872
 873         return [{
 874             'id':       track_id,
 875             'url':      final_url,
 876             'ext':      "mp3",
 877             'title':    title,
 878             'artist':   artist,
 879         }]
 880
 881 class Vbox7IE(InfoExtractor):
 882     """Information Extractor for Vbox7"""
 883     _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
 884
 885     def _real_extract(self,url):
 886         mobj = re.match(self._VALID_URL, url)
 887         if mobj is None:
 888             raise ExtractorError(u'Invalid URL: %s' % url)
 889         video_id = mobj.group(1)
 890
 891         redirect_page, urlh = self._download_webpage_handle(url, video_id)
 892         new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
 893         redirect_url = urlh.geturl() + new_location
 894         webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
 895
 896         title = self._html_search_regex(r'<title>(.*)</title>',
 897             webpage, u'title').split('/')[0].strip()
 898
 899         ext = "flv"
 900         info_url = "http://vbox7.com/play/magare.do"
 901         data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
 902         info_request = compat_urllib_request.Request(info_url, data)
 903         info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
 904         info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
 905         if info_response is None:
 906             raise ExtractorError(u'Unable to extract the media url')
 907         (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
 908
 909         return [{
 910             'id':        video_id,
 911             'url':       final_url,
 912             'ext':       ext,
 913             'title':     title,
 914             'thumbnail': thumbnail_url,
 915         }]
 916
 917
 918 def gen_extractors():
 919     """ Return a list of an instance of every supported extractor.
 920     The order does matter; the first extractor matched is the one handling the URL.
 921     """
 922     return [
 923         YoutubePlaylistIE(),
 924         YoutubeChannelIE(),
 925         YoutubeUserIE(),
 926         YoutubeSearchIE(),
 927         YoutubeIE(),
 928         MetacafeIE(),
 929         DailymotionIE(),
 930         GoogleSearchIE(),
 931         PhotobucketIE(),
 932         YahooIE(),
 933         YahooSearchIE(),
 934         DepositFilesIE(),
 935         FacebookIE(),
 936         BlipTVIE(),
 937         BlipTVUserIE(),
 938         VimeoIE(),
 939         MyVideoIE(),
 940         ComedyCentralIE(),
 941         EscapistIE(),
 942         CollegeHumorIE(),
 943         XVideosIE(),
 944         SoundcloudSetIE(),
 945         SoundcloudIE(),
 946         InfoQIE(),
 947         MixcloudIE(),
 948         StanfordOpenClassroomIE(),
 949         MTVIE(),
 950         YoukuIE(),
 951         XNXXIE(),
 952         YouJizzIE(),
 953         PornotubeIE(),
 954         YouPornIE(),
 955         GooglePlusIE(),
 956         ArteTvIE(),
 957         NBAIE(),
 958         WorldStarHipHopIE(),
 959         JustinTVIE(),
 960         FunnyOrDieIE(),
 961         SteamIE(),
 962         UstreamIE(),
 963         RBMARadioIE(),
 964         EightTracksIE(),
 965         KeekIE(),
 966         TEDIE(),
 967         MySpassIE(),
 968         SpiegelIE(),
 969         LiveLeakIE(),
 970         ARDIE(),
 971         ZDFIE(),
 972         TumblrIE(),
 973         BandcampIE(),
 974         RedTubeIE(),
 975         InaIE(),
 976         HowcastIE(),
 977         VineIE(),
 978         FlickrIE(),
 979         TeamcocoIE(),
 980         XHamsterIE(),
 981         HypemIE(),
 982         Vbox7IE(),
 983         GametrailersIE(),
 984         StatigramIE(),
 985         GenericIE()
 986     ]
 987
 988 def get_info_extractor(ie_name):
 989     """Returns the info extractor class with the given ie_name"""
 990     return globals()[ie_name+'IE']