]> gitweb @ CieloNegro.org - youtube-dl.git/blob - youtube_dl/InfoExtractors.py
a1c1298df286cfc4c73881034acb9ba89b2bb8f0
[youtube-dl.git] / youtube_dl / InfoExtractors.py
1 import base64
2 import datetime
3 import itertools
4 import netrc
5 import os
6 import re
7 import socket
8 import time
9 import email.utils
10 import xml.etree.ElementTree
11 import random
12 import math
13 import operator
14 import hashlib
15 import binascii
16 import urllib
17
18 from .utils import *
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
20
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.comedycentral import ComedyCentralIE
25 from .extractor.dailymotion import DailymotionIE
26 from .extractor.gametrailers import GametrailersIE
27 from .extractor.generic import GenericIE
28 from .extractor.googleplus import GooglePlusIE
29 from .extractor.googlesearch import GoogleSearchIE
30 from .extractor.metacafe import MetacafeIE
31 from .extractor.myvideo import MyVideoIE
32 from .extractor.statigram import StatigramIE
33 from .extractor.photobucket import PhotobucketIE
34 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
35 from .extractor.vimeo import VimeoIE
36 from .extractor.yahoo import YahooIE, YahooSearchIE
37 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
38 from .extractor.zdf import ZDFIE
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        """Extract the direct file URL and title for a depositfiles link.

        Rebuilds the URL in the English locale (so the scraped markup is
        predictable), requests the page with the 'Free download' gateway
        flag set, then scrapes the fileshare form action and page title.
        """
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed (POST).
        # Use the shared _download_webpage helper instead of a raw
        # urlopen().read(): it decodes the response to text (the old code's
        # str-pattern regexes and .decode('utf-8') calls broke on the bytes
        # urlopen returns under Python 3) and already wraps network errors
        # in ExtractorError, matching the other extractors in this file.
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        webpage = self._download_webpage(request, file_id,
                                         note=u'Retrieving file webpage',
                                         errnote=u'Unable to retrieve file webpage')

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            else:
                raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')

        return [{
            'id':       file_id,
            'url':      file_url,
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension,
        }]
102
103
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    # Mobile login endpoint; credentials are POSTed here by _real_initialize.
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Log in to Facebook before extraction, if credentials are available.

        Credentials come from the downloader's username/password params or,
        failing that, from the 'facebook' machine entry in ~/.netrc.  Every
        failure path emits a warning and returns, so extraction proceeds
        anonymously rather than aborting.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    # Raised (and caught just below) so both netrc failure
                    # modes share the same warning path.
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # No credentials from either source: stay anonymous.
        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # If the response still contains a login form, the attempt failed.
            # NOTE(review): urlopen().read() returns bytes on Python 3, so
            # this str-pattern search would raise TypeError there — confirm
            # this path is only exercised under Python 2 semantics.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the video URL and metadata from a Facebook video page.

        Parses the JSON array embedded between two swf-setup JavaScript
        fragments, preferring the HD source ('hd_src') and falling back to
        SD ('sd_src').
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        # Normalize to the canonical video page for the scraped ID.
        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player parameters are the JSON literal sitting between these
        # two exact JavaScript fragments in the page source.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        # 'params' is itself percent-encoded JSON.
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
            webpage, u'title')

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
198
199
200
201
202
203
204
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        """Extract an Escapist video via its player configuration.

        Scrapes description/thumbnail/player URL from the page's meta tags,
        then downloads the player's config "JSON" (actually JavaScript with
        single quotes) to find the real video URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(videoId)
        webpage = self._download_webpage(url, videoId)

        videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
            webpage, u'description', fatal=False)

        imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
            webpage, u'thumbnail', fatal=False)

        playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
            webpage, u'player url')

        # The <meta name="title"> value is "Show : Episode"; keep the part
        # after the last separator.  Bug fix: this lookup previously reused
        # the copy-pasted field name u'player url', so a failed title match
        # reported the wrong field in the error message.
        title = self._html_search_regex('<meta name="title" content="([^"]*)"',
            webpage, u'title').split(' : ')[-1]

        # The player URL carries its config URL percent-encoded in a query arg.
        configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
        configUrl = compat_urllib_parse.unquote(configUrl)

        configJSON = self._download_webpage(configUrl, videoId,
                                            u'Downloading configuration',
                                            u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))

        # Playlist entry 0 is an ad; entry 1 is the actual video.
        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': title,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': videoDesc,
            'player_url': playerUrl,
        }

        return [info]
264
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    # Marked broken: the endpoints this scrapes have changed upstream.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        """Resolve a collegehumor video via its moogaloop XML and f4m manifest.

        Two round trips: the moogaloop XML supplies metadata plus the
        manifest URL; the Adobe f4m manifest supplies the media node used
        to build the final fragment URL (ext 'f4f').
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            raise ExtractorError(u'Invalid metadata XML file')

        # hdcore query arg is expected by Adobe HTTP Dynamic Streaming servers.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            # f4m elements live in the Adobe manifest XML namespace.
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            raise ExtractorError(u'Invalid manifest file')

        # Build the first-fragment URL.  video_id[:-2] drops a 2-char suffix
        # — presumably a quality marker; verify against a live manifest.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
326
327
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        """Scrape the flv URL, title and thumbnail for an xvideos video."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = match.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The flv URL is percent-encoded inside the page's player parameters.
        video_url = compat_urllib_parse.unquote(
            self._search_regex(r'flv_url=(.+?)&', webpage, u'video URL'))

        # The page <title> ends with a site suffix; the regex strips it.
        video_title = self._html_search_regex(
            r'<title>(.*?)\s+-\s+XVID', webpage, u'title')

        video_thumbnail = self._search_regex(
            r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
368
369
370
371
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        """Decode the base64 video reference embedded in an InfoQ page."""
        if re.match(self._VALID_URL, url) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The rtmpe path is stored base64-encoded in a JS variable.
        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        video_title = self._search_regex(
            r'contentTitle = "(.*?)";', webpage, u'title')

        video_description = self._html_search_regex(
            r'<meta name="description" content="(.*)"(?:\s*/)?>',
            webpage, u'description', fatal=False)

        # The id and extension come from the final path component of the URL.
        video_id, extension = video_url.split('/')[-1].split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension,  # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
414
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        file_url = None
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        # Probes each candidate with an HTTP request; returns None when
        # every URL in the list fails.
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                url = None

        return None

    def _print_formats(self, formats):
        """Print the available format/bitrate/extension combinations."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Extract a cloudcast via the mixcloud JSON API.

        Derives the API URL from the page URL's uploader/name components,
        then picks a working audio URL for the requested (or best) format.

        NOTE(review): the .decode('utf-8') calls below only work on
        Python 2 byte strings; on Python 3 str they raise AttributeError.
        Likewise, if check_urls() finds no live URL, file_url stays None
        and file_url.decode(...) would fail.  Consistent with
        _WORKING = False above.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)
        bitrate = None

        # --list-formats: print the table and stop without returning info.
        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Try every advertised format until one has a live URL.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
        }]
519
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    # The URL may name a specific video, a course page, or the site root;
    # _real_extract branches on which named groups matched.
    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        """Extract a video, a whole course, or the full site.

        - course + video -> single video resolved via its metadata XML
        - course only    -> recurse into every VideoPage link on the course page
        - neither        -> recurse into every CoursePage link on the home page
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                raise ExtractorError(u'Invalid metadata XML file')
            # Extension is whatever follows the last '.' in the video file name.
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])

            info['description'] = self._html_search_regex('<description>([^<]+)</description>',
                coursepage, u'description', fatal=False)

            # orderedSet keeps the page's link order while dropping duplicates.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            # Recurse: each reference re-enters this extractor as a video URL.
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            # Recurse: each reference re-enters this extractor as a course URL.
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
615
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        """Extract a video from an mtv.com /videos/ page.

        Reads the mtvn <meta> tags from the page, queries the mediaGen
        service for the available renditions, and returns the last
        (highest-quality) one.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
            webpage, u'song name', fatal=False)

        video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
            webpage, u'title')

        mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
            webpage, u'mtvn_uri', fatal=False)

        content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
            webpage, u'content id', fatal=False)

        # NOTE(review): mtvn_uri and content_id are extracted with
        # fatal=False and may be None, which would make this concatenation
        # raise TypeError; confirm whether they should be fatal instead.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            # MIME type is e.g. 'video/mp4'; keep the subtype as extension.
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            raise ExtractorError('Invalid rendition field.')

        info = {
            'id': video_id,
            # Bug fix: this previously referenced the undefined name
            # 'performer' (NameError at runtime); the value extracted from
            # the mtv_vt meta tag is the intended uploader.
            'uploader': song_name,
            'url': video_url,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
676
677
class YoukuIE(InfoExtractor):
    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        """Build a pseudo-random session id: epoch millis + two random ints."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Return the alphabet shuffled by Youku's seeded LCG.

        Reproduces the site's JavaScript: a linear congruential generator
        driven by 'seed' repeatedly picks (and removes) a character from the
        source alphabet, yielding the permutation used to descramble file ids.
        """
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Descramble a '*'-separated file id using the seeded permutation.

        Each component of 'fileId' is an index into the shuffled alphabet
        produced by _get_file_ID_mix_string.
        """
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        """Extract all segments of a Youku video as a multi-part result.

        Downloads the getPlayList JSON, chooses a format ('best' prefers
        hd2, 'worst' maps to mp4, anything else falls back to flv),
        descrambles the file id, and emits one info dict per segment.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                # Any explicit format request other than 'worst' is coerced
                # to flv.
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            # One key per segment, required by the getFlvPath endpoint.
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
770
771
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        """Scrape the flv URL, title and thumbnail from an xnxx video page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = match.group(1)

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # The flv URL is percent-encoded inside the player parameters.
        video_url = compat_urllib_parse.unquote(
            self._search_regex(self.VIDEO_URL_RE, webpage, u'video URL'))

        video_title = self._html_search_regex(
            self.VIDEO_TITLE_RE, webpage, u'title')

        video_thumbnail = self._search_regex(
            self.VIDEO_THUMB_RE, webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
810
811
812
class NBAIE(InfoExtractor):
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        """Extract a single MP4 from an nba.com video page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = match.group(1)

        webpage = self._download_webpage(url, video_id)

        # The MP4 is served from Turner's CDN at a path derived directly
        # from the page's URL path.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        # Use only the last path component as the reported id.
        shortened_video_id = video_id.rpartition('/')[2]
        title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
            webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')

        # It isn't there in the HTML it returns to us
        # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)

        description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)

        return [{
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # 'uploader_date': uploader_date,
            'description': description,
        }]
846
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    # Three URL shapes: a bare channel page, /b/<id> archived broadcasts,
    # and /c/<id> chapter pages.
    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    # Page size used when paging through a channel's archive API.
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        """Download one JSON API page and convert its clips to info dicts.

        Returns (item count of the raw response, list of info dicts for
        the clips that have a non-empty video_file_url).
        """
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        # On failure the API answers with a dict carrying an 'error' key
        # instead of the usual list of clips.
        if type(response) != list:
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            # Clips without a file URL are skipped entirely.
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time begins with YYYY-MM-DD; dropping the dashes
                # yields the YYYYMMDD upload_date convention.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            # Whole channel: page through the archive API in the loop below.
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            # Chapter page: resolve the chapter to its parent archive and
            # download the whole archive file (see warning further down).
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            # for/else: raise only if no <archive> element matched (the
            # loop ran to completion without hitting break).
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            # Fall back to flv when the URL carries no usable extension.
            video_ext = video_url.rpartition('.')[2] or u'flv'

            # Title/description/uploader come from the Twitch kraken API.
            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            # Single archived broadcast.
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            # A short page means the archive is exhausted; non-paged
            # requests are a single fetch.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
979
class FunnyOrDieIE(InfoExtractor):
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        """Extract a single MP4 from a funnyordie.com video page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'invalid URL: %s' % url)
        video_id = match.group('id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._html_search_regex(
            r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            webpage, u'video URL', flags=re.DOTALL)

        # Prefer the player headline; fall back to the <title> tag.
        title = self._html_search_regex(
            (r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
             r'<title>(?P<title>[^<]+?)</title>'),
            webpage, 'title', flags=re.DOTALL)

        description = self._html_search_regex(
            r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
        }]
1008
class SteamIE(InfoExtractor):
    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """
    _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
    _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL uses verbose-mode whitespace, so it must be matched
        # with re.VERBOSE explicitly.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Return a playlist containing every trailer on a game's video page."""
        gameID = re.match(self._VALID_URL, url, re.VERBOSE).group('gameID')

        videourl = self._VIDEO_PAGE_TEMPLATE % gameID
        webpage = self._download_webpage(videourl, gameID)

        # Some store pages sit behind an age gate; re-fetch through the
        # agecheck URL, which carries a fixed birth date.
        if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
            videourl = self._AGECHECK_TEMPLATE % gameID
            self.report_age_confirmation()
            webpage = self._download_webpage(videourl, gameID)

        self.report_extraction(gameID)
        game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
                                             webpage, 'game title')

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        # Zipping the three finditer streams pairs each movie entry with
        # its title span and thumbnail image in document order.
        videos = []
        for movie, name, thumb in zip(re.finditer(urlRE, webpage),
                                      re.finditer(namesRE, webpage),
                                      re.finditer(thumbsRE, webpage)):
            video_id = movie.group('videoID')
            video_url = movie.group('videoURL')
            if not video_url:
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
            videos.append({
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(name.group('videoName')),
                'thumbnail': thumb.group('thumbnail'),
            })
        return [self.playlist_result(videos, gameID, game_title)]
1063
class UstreamIE(InfoExtractor):
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        """Extract a recorded ustream.tv video."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # The FLV is served from a predictable CDN path keyed on the id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        title = self._html_search_regex(r'data-title="(?P<title>.+)"',
            webpage, u'title')
        uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)
        thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
            webpage, u'thumbnail', fatal=False)

        # Note: returns a single info dict, not a list.
        return {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
            'thumbnail': thumbnail,
        }
1095
class WorldStarHipHopIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        """Extract a single video from worldstarhiphop/worldstarcandy."""
        video_id = re.match(self._VALID_URL, url).group('id')

        webpage_src = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
            webpage_src, u'video URL')

        # Pick the container extension from the file URL itself.
        ext = 'mp4' if 'mp4' in video_url else 'flv'

        video_title = self._html_search_regex(r"<title>(.*)</title>",
            webpage_src, u'title')

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
            webpage_src, u'thumbnail', fatal=False)
        if not thumbnail:
            candy_match = re.search(r"""candytitles.*>(.*)</span>""", webpage_src)
            if candy_match is not None:
                video_title = candy_match.group(1)

        return [{
            'id': video_id,
            'url' : video_url,
            'title' : video_title,
            'thumbnail' : thumbnail,
            'ext' : ext,
        }]
1135
class RBMARadioIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        """Extract an rbmaradio.com show from its embedded JSON metadata."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        # Show metadata lives in the page's "gon" JavaScript object.
        json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
            webpage, u'json data', flags=re.MULTILINE)
        try:
            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        video_url = data['akamai_url'] + '&cbr=256'
        # Derive the extension from the URL path component.
        url_parts = compat_urllib_parse_urlparse(video_url)
        video_ext = url_parts.path.rpartition('.')[2]
        host = data.get('host', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }]
1169
1170
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the format dict whose 'format' equals req_format, else None."""
        for x in formats:
            if x["format"] == req_format:
                return x
        return None

    def _real_extract(self, url):
        """Extract one or more downloadable formats for a YouPorn video.

        Honors the downloader's 'format' and 'listformats' options;
        returns a list of info dicts (or None after listing formats).
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # The site requires an age-verification cookie before it serves
        # the real video page.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Metadata is embedded as JSON in the player's Video() constructor.
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        try:
            params = json.loads(json_params)
        except ValueError:
            # BUGFIX: was a bare "except:", which also swallowed
            # KeyboardInterrupt/SystemExit; json.loads raises ValueError.
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
        try:
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
        except KeyError as e:
            # BUGFIX: was "'Missing JSON parameter: ' + sys.exc_info()[1]",
            # which raises TypeError (str + exception object). Use str(e).
            raise ExtractorError('Missing JSON parameter: ' + str(e))

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:
            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            # The 5th path component encodes "<size>_<bitrate>_<id>";
            # the format label is "<size>-<bitrate>".
            format = "-".join(path.split('/')[4].split('_')[:2])

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'ext': extension,
                'format': format,
                'thumbnail': thumbnail,
                'description': video_description
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific(req_format, formats)
            # BUGFIX: previously tested the undefined name "result",
            # raising NameError instead of the intended error below.
            if format is None:
                raise ExtractorError(u'Requested format not available')
            return [format]
1275
1276
1277
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        """Extract a single FLV from a pornotube.com media page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # Both the id and the title come straight from the URL.
        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        webpage = self._download_webpage(url, video_id)

        # The FLV location is embedded (percent-encoded) in the player config.
        video_url = self._search_regex(
            r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",',
            webpage, u'video url')
        video_url = compat_urllib_parse.unquote(video_url)

        # Upload date appears as digits and slashes; normalize to YYYYMMDD.
        upload_date = self._html_search_regex(
            r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by',
            webpage, u'upload date', fatal=False)
        if upload_date:
            upload_date = unified_strdate(upload_date)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': upload_date,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
        }]
1312
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        """Extract a single FLV via the video's embed page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
            webpage, u'title').strip()

        # The actual stream URL lives on a separate embed page.
        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = embed_match.group(0).strip()
        # From here on, use the numeric id from the embed URL.
        video_id = embed_match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            webpage, u'video URL')

        return [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
            'player_url': embed_page_url,
        }]
1353
class EightTracksIE(InfoExtractor):
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        """Walk an 8tracks mix track-by-track and return all its songs."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # The mix metadata is embedded as a JS assignment on the page.
        json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(json_like)

        # A random session id is required by the play/next API.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)

        entries = []
        next_url = first_url
        # The API exposes one track at a time; follow the "next" chain
        # until it reports the last track.
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            entries.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return entries
1394
class KeekIE(InfoExtractor):
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        """Extract a keek.com clip; media URLs follow a fixed CDN scheme."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id

        webpage = self._download_webpage(url, video_id)

        title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title')
        uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
1422
class TEDIE(InfoExtractor):
    # Matches both playlist URLs and single-talk URLs, with an optional
    # /lang/<code> path component.
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL uses verbose-mode whitespace, so it has to be
        # matched with re.VERBOSE explicitly.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch: a single talk vs. a playlist of talks.
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # NOTE(review): [.\s]*? matches only literal dots and whitespace;
        # it looks like [\s\S]*? (match anything) was intended — confirm.
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
                                                 webpage, 'playlist title')

        # Each talk becomes a url_result tagged with the 'TED' ie key,
        # to be resolved later as an individual talk.
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m = re.match(self._VALID_URL, url,re.VERBOSE)
        video_name = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
        self.report_extraction(video_name)
        # If the url includes the language we get the title translated
        title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
                                        webpage, 'title')
        json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
                                    webpage, 'json data')
        info = json.loads(json_data)
        desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
                                       webpage, 'description', flags = re.DOTALL)

        thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
                                       webpage, 'thumbnail')
        # 'info' is rebound here: the parsed talkDetails dict above is
        # folded into the final info dict; htmlStreams[-1] selects the
        # last stream listed.
        info = {
                'id': info['id'],
                'url': info['htmlStreams'][-1]['file'],
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumbnail,
                'description': desc,
                }
        return info
1497
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de videos."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        """Fetch the metadata XML for a myspass.de URL and build the info dict."""
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # Bug fix: this branch previously read the undefined name 'ext'
            # and raised NameError; fall back to the file extension instead.
            video_format = extension
        else:
            video_format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': video_format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
1551
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de video pages."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
            webpage, u'title')

        # A separate XML document lists the stream variants for this video.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # Keep the last listed variant, as the original implementation did.
        chosen_variant = idoc[-1]
        filename = chosen_variant.findall('./filename')[0].text
        duration = float(chosen_variant.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
1583
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com view pages."""

    # Bug fix: the scheme was previously written as 'http?://', which matches
    # 'htt' plus an optional 'p' and therefore never matches 'https://'.
    # 'https?://' accepts both http and https URLs.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        """Extract the media URL and metadata from a LiveLeak view page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # The og:title carries a site-name prefix; strip it for a clean title.
        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

        info = {
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader
        }

        return [info]
1620
1621
1622
class TumblrIE(InfoExtractor):
    """Information extractor for videos embedded in Tumblr posts."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        blog = mobj.group('blog_name')

        # Normalize to the canonical post URL before downloading.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        m_video = re.search(re_video, webpage)
        if m_video is None:
            raise ExtractorError(u'Unable to extract video')
        video_url = m_video.group('video_url')
        ext = m_video.group('ext')

        # We pick the first poster
        video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)
        if video_thumbnail:
            video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'ext': ext,
        }]
1656
class BandcampIE(InfoExtractor):
    """Information extractor for free Bandcamp tracks."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        """Resolve a Bandcamp track page to its free mp3-320 download URL."""
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        # Renamed from 'id' to avoid shadowing the builtin.
        video_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE | re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, video_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url
        # This url is built by Bandcamp in the script download_bundle_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), video_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, video_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {
            'id': video_id,
            'title': info[u'title'],
            'ext': 'mp3',
            'url': final_url,
            'thumbnail': info[u'thumb_url'],
            'uploader': info[u'artist'],
        }

        return [track_info]
1702
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
            webpage, u'video URL')
        video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            webpage, u'title')

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
        }
        return [info]
1730         
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')
        # The MRSS feed is used instead of the HTML page.
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(
            r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            webpage, u'video URL')
        video_title = self._search_regex(
            r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
        }
        return [info]
1757
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')
        # Always fetch the canonical page for this video id.
        webpage = self._download_webpage('http://www.howcast.com/videos/' + video_id, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')
        video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')
        video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
        }
        return [info]
1791
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage('https://vine.co/v/' + video_id, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')
        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')
        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)
        uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }
        return [info]
1825
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')

        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # The secret is required by the two playlist/XML requests below.
        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        stream_match = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if stream_match is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = stream_match.group(1) + unescapeHTML(stream_match.group(2))

        video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')
        video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
            'uploader_id': video_uploader_id,
        }]
1874
class TeamcocoIE(InfoExtractor):
    """Information extractor for teamcoco.com video pages."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        # The numeric video id only appears inside the page markup.
        video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
            webpage, u'video id')

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')
        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)
        video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        data = self._download_webpage('http://teamcoco.com/cvp/2.0/%s.xml' % video_id,
            video_id, 'Downloading data webpage')
        video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
            data, u'video URL')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'description': video_description,
        }]
1913
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    # Bug fix: the 'www.' part previously used an unescaped '.', which matched
    # any character; escape it so only a literal dot is accepted.
    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        """Extract the media URL and metadata for an xHamster movie page."""
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        # An empty server means 'file' is already a full (urlencoded) URL.
        if len(mobj.group('server')) == 0:
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server')+'/key='+mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # Can't see the description anywhere in the UI
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        #     webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
1965
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        # Request the page with extra query parameters; the response cookie
        # is reused for the serve request below.
        complete_url = url + "?" + compat_urllib_parse.urlencode({'ax': 1, 'ts': time.time()})
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track = json.loads(html_tracks)[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id': track_id,
            'url': final_url,
            'ext': "mp3",
            'title': title,
            'artist': artist,
        }]
2015
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # The play page redirects via a javascript window.location assignment.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        webpage = self._download_webpage(urlh.geturl() + new_location, video_id, u'Downloading redirect page')

        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        info_url = "http://vbox7.com/play/magare.do"
        data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        # The response is an urlencoded pair: media URL and thumbnail URL.
        final_url, thumbnail_url = (field.split('=')[1] for field in info_response.split('&'))

        return [{
            'id': video_id,
            'url': final_url,
            'ext': "flv",
            'title': title,
            'thumbnail': thumbnail_url,
        }]
2051
2052
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Keep this ordered: matching is first-hit-wins.
    ie_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVIE,
        BlipTVUserIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        ZDFIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        XHamsterIE,
        HypemIE,
        Vbox7IE,
        GametrailersIE,
        StatigramIE,
        GenericIE,
    )
    return [klass() for klass in ie_classes]
2122
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Extractor classes follow the '<Name>IE' naming convention in this module.
    return globals()['%sIE' % ie_name]