2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url) is not None

    # NOTE(review): this region of the file looks truncated — the two
    # docstring/statement fragments below have lost their enclosing method
    # definitions (presumably a working() getter and an initialize() method);
    # confirm against the full file before editing.
    """Getter method for _WORKING."""

    """Initializes an instance (authentication, etc)."""
    self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""

    # NOTE(review): orphaned body line — presumably the body of an IE_NAME
    # property whose `def` line is missing in this view. It derives the IE
    # name from the class name by dropping the trailing "IE".
    return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        # NOTE(review): the `if note is None:` / `try:` / `if errnote is None:`
        # lines appear to be missing from this view; statements kept as seen.
        note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        webpage_bytes = urlh.read()
        # 'replace' so undecodable bytes never abort the extraction.
        return webpage_bytes.decode('utf-8', 'replace')
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # NOTE(review): the `_VALID_URL = r'''...` assignment header and closing
    # quote are missing from this view; the verbose-regex body below is
    # preserved unchanged (its # comments are part of the pattern text).
                     (?:https?://)?                                       # http(s):// (optional)
                     (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                        tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                     (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                     (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                     (?:                                                  # the various things that can precede the ID:
                         (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                         |(?:                                             # or the v= param in all its forms
                             (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                             (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                             (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                         )?                                               # optional -> youtube.com/xxxx is OK
                     )?                                                   # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                     # here is it! the YouTube video ID
                     (?(1).+)?                                            # if we found the ID, everything can follow
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # NOTE(review): most entries (and the closing braces) of the two mappings
    # below — itag -> file extension and itag -> dimension string — are
    # missing from this view.
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    _video_dimensions = {

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # re.VERBOSE because _VALID_URL is a commented, multi-line pattern.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _extract_subtitles(self, video_id):
        # Returns a (warning_message, srt_contents) pair; exactly one of the
        # two is non-None.
        # NOTE(review): several lines (`try:`, the `srt_lang = 'en'` / `else:`
        # branch, the remaining urlencode params, and the final try:) are
        # missing from this view; statements preserved as seen.
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
            srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        # Map lang_code -> track name from the caption track listing.
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            srt_lang = list(srt_lang_list.keys())[0]
        if not srt_lang in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        params = compat_urllib_parse.urlencode({
            'name': srt_lang_list[srt_lang].encode('utf-8'),
        url = 'http://www.youtube.com/api/timedtext?' + params
            srt = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        return (u'WARNING: Did not fetch video subtitles', None)

    def _print_formats(self, formats):
        """Print the itag / extension / dimensions table for --list-formats."""
        # NOTE(review): the `for x in formats:` loop header is missing from
        # this view.
        print('Available formats:')
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        # Sets language, logs in (username/password or .netrc) and confirms
        # age, all best-effort: failures are reported as warnings.
        # NOTE(review): many lines of this method (returns, `try:` headers,
        # the username/password defaults, the login_form_strs literal header,
        # the age_form literal header) are missing from this view; statements
        # preserved as seen.
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))

        # Set language (best-effort)
        request = compat_urllib_request.Request(self._LANG_URL)
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))

        # No authentication to be performed
        request = compat_urllib_request.Request(self._LOGIN_URL)
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to fetch login page: %s' % compat_str(err))

        # Hidden form tokens required by the Google login form.
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
            galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)

            u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'PersistentCookie': u'yes',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'signIn': u'Sign in',
            u'service': u'youtube',

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # mishandles mixed unicode/bytes input.
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # The login form being served back means authentication failed.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))

        # Confirm age
            'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))

    def _extract_id(self, url):
        """Extract the YouTube video id from a URL."""
        # NOTE(review): the `if mobj is None:` guard and the trailing
        # `return video_id` are missing from this view.
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(2)

    def _real_extract(self, url):
        # NOTE(review): many control-flow lines (`try:` headers, `if mobj is
        # not None:` guards, `return`/`break` statements, dict-literal
        # headers) are missing from this view; statements preserved as seen.
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage; has_verified=1 skips the age gate.
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Get video info: try several &el= variants until one yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
            video_uploader_id = mobj.group(1)
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: normalize separators, then try several date formats.
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
            video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
                self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # NOTE(review): assumes every stream entry carries a 'sig' key —
            # entries without it would raise KeyError here; confirm upstream.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific formats. We pick the first in a slash-delimited sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                        video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

                'url': video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension,
                'format': video_format,
                'thumbnail': video_thumbnail,
                'description': video_description,
                'player_url': player_url,
                'subtitles': video_subtitles,
                'duration': video_duration
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # Fetch the family-filter disclaimer, then POST past the age gate.
        # NOTE(review): `try:` headers, `return` lines and the
        # disclaimer_form dict header are missing from this view.
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age
            'submit': "Continue - I'm over 18",
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        # NOTE(review): guard (`if mobj is None:`), `try:` and `return`
        # lines are missing from this view; statements preserved as seen.
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube; if so, delegate to YoutubeIE.
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)

            # Fallback: pull mediaURL/key out of the flashvars blob.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
                self._downloader.trouble(u'ERROR: unable to extract media URL')

            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # NOTE(review): guard/loop-body/return lines are missing from this
        # view; statements preserved as seen.
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Strip title suffix and query string off the path component.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter so restricted videos are reachable.
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Probe qualities best-to-worst; first key present in flashvars wins.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
            self._downloader.trouble(u'ERROR: unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
            self._downloader.trouble(u'ERROR: unable to extract video URL')

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
            # looking for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
                video_uploader = mobj_official.group(1)
            video_uploader = mobj.group(1)

        video_upload_date = None
        # Page shows DD-MM-YYYY; reassemble as YYYYMMDD.
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # NOTE(review): guard (`if mobj is None:`), `try:` and `return`
        # lines are missing from this view; statements preserved as seen.
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        # Title and uploader come from the page <title>.
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # NOTE(review): guard (`if mobj is None:`), `try:` and `return`
        # lines are missing from this view; statements preserved as seen.
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
                self._downloader.trouble(u'ERROR: Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            # Recurse once with the canonical /watch/ URL.
            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
        # NOTE(review): group(1) is the (people|profile) alternative, not the
        # uploader name — group(2) looks intended; confirm against upstream.
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
        yv_bitrate = '700' # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

            'id': video_id.decode('utf-8'),
            'uploader': video_uploader,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com.

    NOTE(review): this excerpt appears to have lost several scaffolding
    lines (``if mobj is None:`` guards, ``try:`` openers, ``return``
    statements); the error calls below are normally guarded by them.
    """

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract the info dict(s) for a Vimeo URL.

        Normalizes the URL, downloads the page, parses the embedded
        config JSON and picks the best quality/codec combination.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        # (guard elided) reached only when the URL did not match _VALID_URL
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            # Normalize HLS player-redirect links to the canonical page.
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        # (try: opener elided)
        self.report_download_webpage(video_id)
        webpage_bytes = compat_urllib_request.urlopen(request).read()
        webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON (embedded inline as "... = {config:...,assets:...}")
        config = webpage.split(' = {config:')[1].split(',assets:')[0]
        config = json.loads(config)
        # (except/guard elided) reached when the config cannot be parsed
        self._downloader.trouble(u'ERROR: unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            # Collapse YYYY-MM-DD into the YYYYMMDD form used for upload_date.
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                # (else: elided) fall back to the first advertised quality
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the best available: prefer hd, then sd, then anything else.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
        # (break/else elided) reached when no known codec was found
        self._downloader.trouble(u'ERROR: no known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
            %(video_id, sig, timestamp, video_quality, video_codec.upper())

        # (info-dict literal; opening "return [{" and id/url entries elided)
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor.

    Handles both live-stream pages (matching _LIVE_URL) and the "+7"
    catch-up videos. NOTE(review): several scaffolding lines (guards,
    ``try:`` openers, ``return`` statements, and some positional
    arguments of the grep_webpage calls) appear elided in this excerpt.
    """

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download *url* and return its raw contents (return line elided)."""
        request = compat_urllib_request.Request(url)
        # (try: opener elided)
        self.report_download_webpage(url)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex* and map matched groups into a dict.

        matchTuples is a list of (group_index, key, error_message)
        triples; each named group value is stored under its key.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        # (guard elided) reached when the regex did not match at all
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
            # (else: elided)
                info[key] = mobj.group(i)
        # (return info elided)

    def extractLiveStream(self, url):
        """Resolve the SWF player and stream path for a live page."""
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            # (url argument elided)
            r'src="(.*?/videothek_js.*?\.js)',
            # (regexFlags argument elided)
            (1, 'url', u'ERROR: Invalid URL: %s' % url)
        # (closing bracket/paren elided)
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            # (next_url argument elided)
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
            # (third pattern segment and regexFlags elided)
            (1, 'path', u'ERROR: could not extract video path: %s' % url),
            (2, 'player', u'ERROR: could not extract video player: %s' % url),
            (3, 'url', u'ERROR: could not extract video url: %s' % url)
        # (closing bracket/paren elided)
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the videoref chain for a "+7" catch-up video and
        return its info dict (return wrapper elided below)."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            # (url argument elided)
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            # (regexFlags argument elided)
            (1, 'url', u'ERROR: Invalid URL: %s' % url)
        # (closing bracket/paren elided)
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            # (next_url argument elided)
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            # (regexFlags argument elided)
            (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
        # (closing bracket/paren elided)
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            # (next_url argument elided)
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            # (regexFlags argument elided)
            (1, 'id', u'ERROR: could not extract video id: %s' % url),
            (2, 'title', u'ERROR: could not extract video title: %s' % url),
            (3, 'date', u'ERROR: could not extract video date: %s' % url),
            (4, 'url', u'ERROR: could not extract video url: %s' % url)
        # (closing bracket/paren and "return {" opener elided)
            'id': info.get('id'),
            'url': compat_urllib_parse.unquote(info.get('url')),
            'uploader': u'arte.tv',
            'upload_date': info.get('date'),
            'title': info.get('title').decode('utf-8'),

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        # Live pages are dispatched separately from "+7" catch-up videos.
        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            # (return elided)
        # (else: elided)
            info = self.extractPlus7Stream(url)
        # (return [info] elided)
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    Tries to find a direct video URL inside an arbitrary webpage,
    after first resolving URL-shortener redirects. NOTE(review):
    guards, ``try:`` openers and ``return`` statements appear elided
    in this excerpt.
    """

    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            # Issue a HEAD instead of a GET so only headers are transferred.
            def get_method(self):
                # (return "HEAD" elided)

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-describing headers; a HEAD request has no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       # (headers=newheaders argument elided)
                                       origin_req_host=req.get_origin_req_host(),
                                       # (unverifiable=True argument elided)
                # (else: elided)
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # (fp.read()/fp.close() cleanup elided)
                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                        # (headers=newheaders argument elided)
                                        origin_req_host=req.get_origin_req_host(),
                                        # (unverifiable=True argument elided)

        # Build a bespoke opener wired with the HEAD-preserving handlers.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # (comparison of url vs new_url and early "return False" elided)
        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        # (return True elided)

    def _real_extract(self, url):
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        # (try: opener elided)
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        # (if mobj is None: elided)
        # Broaden the search a little bit
        mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        # (if mobj is None: elided)
        # Broaden the search a little bit: JWPlayer JS loader
        mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        # (if mobj is None: elided) all three patterns failed
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        # (guard elided)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        # (guard elided) NOTE(review): the message says "title" but this
        # branch extracts the uploader/domain — looks copy-pasted.
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_uploader = mobj.group(1)

        # (info-dict literal; "return [{" opener and id/url entries elided)
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    NOTE(review): guards, ``try:`` openers, ``return`` statements and
    loop initializers appear elided in this excerpt.
    """
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        # NOTE(review): assumes *query* is bytes (Python 2 semantics);
        # .decode would fail on a str under Python 3.
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearch[N|all]:terms pseudo-URL and dispatch downloads."""
        mobj = re.match(self._VALID_URL, query)
        # (guard elided)
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        # (if prefix == '': elided) bare "ytsearch:" means one result
        query = query.encode('utf-8')
        self._download_n_results(query, 1)
        # (return elided)
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
        # (return / else: try: n = int(prefix) / if n <= 0: elided)
            self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_youtube_results:
            # Clamp oversized requests to the API maximum.
            self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
            n = self._max_youtube_results
        self._download_n_results(query, n)
        # (return elided)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)
        # (return elided)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # (video_ids = [] / pagenum = 0 / limit initialization elided)
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            # start-index is 1-based in the GData API.
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            # (try: opener elided)
            data = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
            # (return elided)
            api_response = json.loads(data)['data']

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Never page past what was requested or what actually exists.
            limit = min(n, api_response['totalItems'])
            # (pagenum increment elided)

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        # (return elided)
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    NOTE(review): guards, ``try:`` openers, ``return`` statements and
    loop initializers appear elided in this excerpt.
    """
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        # NOTE(review): assumes *query* is bytes (Python 2 semantics).
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the gvsearch[N|all]:terms pseudo-URL and dispatch downloads."""
        mobj = re.match(self._VALID_URL, query)
        # (guard elided)
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        # (if prefix == '': elided) bare "gvsearch:" means one result
        query = query.encode('utf-8')
        self._download_n_results(query, 1)
        # (return elided)
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
        # (return / else: try: n = int(prefix) / if n <= 0: elided)
            self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_google_results:
            # Clamp oversized requests to the supported maximum.
            self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
            n = self._max_google_results
        self._download_n_results(query, n)
        # (return elided)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)
        # (return elided)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # (video_ids = [] / pagenum = 0 / while True: elided)
        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
        request = compat_urllib_request.Request(result_url)
        # (try: opener elided)
        page = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
        # (return elided)

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in video_ids:
                video_ids.append(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                    # (return elided)

        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            # No further result pages: download what we collected and stop.
            for id in video_ids:
                self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
            # (return elided)

        pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    NOTE(review): guards, ``try:`` openers, ``return`` statements and
    loop initializers appear elided in this excerpt.
    """

    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        # NOTE(review): assumes *query* is bytes (Python 2 semantics).
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearch[N|all]:terms pseudo-URL and dispatch downloads."""
        mobj = re.match(self._VALID_URL, query)
        # (guard elided)
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        # (if prefix == '': elided) bare "yvsearch:" means one result
        query = query.encode('utf-8')
        self._download_n_results(query, 1)
        # (return elided)
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
        # (return / else: try: n = int(prefix) / if n <= 0: elided)
            self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_yahoo_results:
            # Clamp oversized requests to the supported maximum.
            self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
            n = self._max_yahoo_results
        self._download_n_results(query, n)
        # (return elided)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)
        # (return elided)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # (video_ids = [] initialization elided)
        # already_seen deduplicates ids across result pages.
        already_seen = set()
        # (pagenum initialization / while True: elided)
        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
        request = compat_urllib_request.Request(result_url)
        # (try: opener elided)
        page = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
        # (return elided)

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in already_seen:
                video_ids.append(video_id)
                already_seen.add(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                    # (return elided)

        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            # No further result pages: download what we collected and stop.
            for id in video_ids:
                self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
            # (return elided)

        pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists.

    NOTE(review): guards, ``try:`` openers, ``return`` statements and
    loop initializers appear elided in this excerpt.
    """

    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Collect all video ids of a playlist page by page and queue them."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        # (guard elided)
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        # Single video case: group 3 captures a direct video id suffix.
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])
            # (return elided)

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        # (else: elided)
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)
        # (video_ids = [] / pagenum = 1 / while True: elided)
        self.report_download_page(playlist_id, pagenum)
        url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
        request = compat_urllib_request.Request(url)
        # (try: opener elided)
        page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
        # (return elided)

        # Extract video identifiers
        # (ids_in_page = [] elided)
        for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        video_ids.extend(ids_in_page)

        # Stop paging when the "Next »" marker disappears.
        if self._MORE_PAGES_INDICATOR not in page:
            # (break elided)
        pagenum = pagenum + 1

        total = len(video_ids)

        # Apply --playlist-start/--playlist-end (1-based; -1 means "to the end").
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        # (else: elided)
            video_ids = video_ids[playliststart:playlistend]

        if len(video_ids) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        # (else: elided)
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        # (return elided)
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels.

    NOTE(review): guards, ``try:`` openers, ``return`` statements and
    loop initializers appear elided in this excerpt.
    """

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        """Collect all video ids of a channel page by page and queue them."""
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        # (guard elided)
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        # Download channel pages
        channel_id = mobj.group(1)
        # (video_ids = [] / pagenum = 1 / while True: elided)
        self.report_download_page(channel_id, pagenum)
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        request = compat_urllib_request.Request(url)
        # (try: opener elided)
        page = compat_urllib_request.urlopen(request).read().decode('utf8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
        # (return elided)

        # Extract video identifiers
        # (ids_in_page = [] elided)
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        video_ids.extend(ids_in_page)

        # Stop paging when the "Next »" marker disappears.
        if self._MORE_PAGES_INDICATOR not in page:
            # (break elided)
        pagenum = pagenum + 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        # (return elided)
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users.

    NOTE(review): guards, ``try:`` openers, ``return`` statements and
    loop initializers appear elided in this excerpt.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # Max results the GData API returns per request.
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                                   (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Collect all of a user's upload ids via the GData API and queue them."""
        mobj = re.match(self._VALID_URL, url)
        # (guard elided)
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.
        # (video_ids = [] / pagenum = 0 / while True: elided)
        start_index = pagenum * self._GDATA_PAGE_SIZE + 1
        self.report_download_page(username, start_index)

        request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

        # (try: opener elided)
        page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
        # (return elided)

        # Extract video identifiers
        # (ids_in_page = [] elided)
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        # again.
        if len(ids_in_page) < self._GDATA_PAGE_SIZE:
            # (break elided)

        # (pagenum increment elided)

        all_ids_count = len(video_ids)
        # Apply --playlist-start/--playlist-end (1-based; -1 means "to the end").
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        # (else: elided)
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                                   (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    NOTE(review): guards, ``try:`` openers, ``return`` statements,
    loop initializers — and the ``_PAGE_SIZE`` class attribute the
    paging check relies on — appear elided in this excerpt.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'

    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                                   (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Collect all of a blip.tv user's video ids via Ajax paging and queue them."""
        mobj = re.match(self._VALID_URL, url)
        # (guard elided)
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        username = mobj.group(1)

        # Mobile episode-list endpoint; %s is filled with the numeric user id.
        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        # (try: opener elided) fetch profile page to resolve the numeric user id
        page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        page_base = page_base % mobj.group(1)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
        # (return elided)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.
        # (video_ids = [] / pagenum = 1 / while True: elided)
        self.report_download_page(username, pagenum)

        request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )

        # (try: opener elided)
        page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # NOTE(review): uses str(err) where the sibling extractors use
            # compat_str(err) — worth unifying for Python 2 unicode safety.
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
        # (return elided)

        # Extract video identifiers
        # (ids_in_page = [] elided)
        for mobj in re.finditer(r'href="/([^"]+)"', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(unescapeHTML(mobj.group(1)))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        # again.
        if len(ids_in_page) < self._PAGE_SIZE:
            # (break elided)

        # (pagenum increment elided)

        all_ids_count = len(video_ids)
        # Apply --playlist-start/--playlist-end (1-based; -1 means "to the end").
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        # (else: elided)
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                                   (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com.

    Fetches the file page with the 'Free download' gateway flag set,
    then scrapes the real fileshare URL and the file title.
    NOTE(review): this copy of the source had structural lines elided;
    try/except bodies and early returns were reconstructed — confirm
    against upstream history.
    """

    # Optional two-letter locale component is skipped via the (?#locale) comment group.
    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        # The file id is the last path component of the URL.
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Site reported a restriction (e.g. rate limit); surface its text.
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # NOTE(review): the .decode('utf-8') calls assume Python 2 byte
        # strings; they fail on Python 3 str — confirm intended runtime.
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook.

    Supports optional login via --username/--password or .netrc, then
    extracts the flash player parameters embedded as JSON in the page.
    NOTE(review): elided structural lines (try/except, early returns,
    the login form dict) were reconstructed — confirm against upstream.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)

    def _real_initialize(self):
        """Log in to Facebook if credentials are available; failures only warn."""
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            # Anonymous access; many videos still work without login.
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # A login <form> in the response means authentication failed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The JSON payload sits between these two literal JS fragments.
        BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        # Prefer HD, fall back to SD.
        video_url = params['hd_src']
        if not video_url:
            video_url = params['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(params['video_duration'])

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': params['thumbnail_src'],
        }
        return [info]
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv.

    Requests the page with skin=json; if the server replies with a
    video/* Content-Type it is a direct download, otherwise the JSON
    body describes the media.
    NOTE(review): elided structural lines (try/except, branches, the
    result dicts) were reconstructed — confirm against upstream.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Filename extension sniffed from the media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Append the JSON-skin query with the correct separator.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves different data to iTunes user agents.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    # Hand the open handle to the downloader so the
                    # response is not fetched twice.
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # Some responses wrap the payload in a 'Post' object.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        return [info]
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de.

    Scrapes the thumbnail base URL from the watch page and derives the
    direct .flv media URL from it; the page <title> becomes the video
    title.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUG FIX: was `self._download.trouble(...)`, which raised
            # AttributeError instead of reporting the bad URL — the
            # attribute is named `_downloader` everywhere else.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        # The thumbnail base URL doubles as the media base URL.
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report.

    Resolves show shortcuts/episode/clip URLs to an MTVN services feed,
    downloads the per-episode MRSS index, then the mediagen config for
    each part, and rewrites the rtmp(e) URL to a plain HTTP mirror.
    NOTE(review): this copy had many lines elided (format tables, loop
    bodies, early returns); they were reconstructed — confirm against
    upstream history.
    """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                     |(https?://)?(www\.)?
                         (?P<showname>thedailyshow|colbertnation)\.com/
                        (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Known bitrates, lowest to highest.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is a verbose-mode pattern.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if mobj.group('shortname'):
            # Expand :tds / :colbert shortcuts to the full-episodes page.
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # Empty episode group means "newest episode": follow redirect.
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # The server redirected us to the concrete newest episode.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                    compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # rtmpdump chokes on these streams; rewrite to the HTTP mirror.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }

            results.append(info)

        return results
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist.

    Reads og: meta tags for description/thumbnail/player, then fetches
    the player's JS config (quasi-JSON) to get the playlist media URL.
    NOTE(review): elided try/except bodies and early returns were
    reconstructed — confirm against upstream.
    """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Decode with the charset the server declared, else UTF-8.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        # The player URL carries the config location as a query param.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        playlist = config['playlist']
        # Index 1 holds the actual video entry (index 0 is an intro).
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'flv',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com.

    Downloads the moogaloop metadata XML, then the Adobe HDS (f4m)
    manifest, and composes a segment URL from it.
    NOTE(review): elided try/except bodies and early returns were
    reconstructed — confirm against upstream.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        # Partially-filled result; remaining keys set below.
        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        # hdcore param is required for the HDS manifest to be served.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            # Elements live in the Adobe f4m namespace.
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')
            return

        url_pr = compat_urllib_parse_urlparse(manifest_url)
        # Compose the first-segment URL from manifest host + media id.
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com.

    Scrapes the flv_url parameter, page title, and thumbnail URL from
    the watch page.
    NOTE(review): elided `if mobj is None:` guards and the result dict
    framing were reconstructed — confirm against upstream.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Extract video URL (percent-encoded in the flv_url parameter)
        mobj = re.search(r'flv_url=(.+?)&', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1))

        # Extract title (strip the trailing " - XVIDEOS..." suffix)
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        # group(0): the full matched thumbnail URL, not just the basename.
        video_thumbnail = mobj.group(0)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }

        return [info]
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid

    NOTE(review): elided try/except bodies and early returns were
    reconstructed — confirm against upstream.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # Resolve the page URL to a track id via the public API.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # Ask the CDN for the available streams of this track.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json_bytes = compat_urllib_request.urlopen(request).read()
            stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)
        # 128 kbit/s MP3 stream is the standard public stream.
        mediaURL = streams['http_mp3_128_url']

        return [{
            'id':       info['id'],
            'url':      mediaURL,
            'uploader': info['user']['username'],
            'upload_date':  info['created_at'],
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com.

    Decodes the base64-encoded media reference embedded in the page and
    builds an rtmpe URL from it.
    NOTE(review): elided `if mobj is None:` guards and the result dict
    framing were reconstructed — confirm against upstream.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: the media path is base64-encoded in jsclassref.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com.

    Uses the site's v1 JSON API; picks a format/bitrate and probes each
    candidate URL until one responds.
    NOTE(review): elided try/except bodies, returns, and loop framing
    were reconstructed — confirm against upstream.
    """

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                url = None

        return None

    def _print_formats(self, formats):
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Try each format until one of its URLs answers.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
        }]
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom.

    Handles three URL shapes: a specific video (course+video), a course
    page (playlist of videos), and the root page (playlist of courses).
    Playlist branches recurse via self.extract on reference entries.
    NOTE(review): elided branch framing (if/elif/else, try/except,
    returns, comprehensions) was reconstructed — confirm against
    upstream.
    """

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                # videoFile is relative to the course's videos/ directory.
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                # Recurse: each reference resolves to a specific video.
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                # Recurse: each reference resolves to a course playlist.
                results += self.extract(entry['url'])
            return results
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            # The regexp allows scheme-less URLs; normalize before fetching
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        # NOTE(review): str.decode does not exist on Python 3 — confirm this
        # path is only reached under Python 2.
        song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _, _, ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com (multi-segment FLV/MP4 videos)."""

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def _gen_sid(self):
        # Session id: millisecond timestamp plus two random suffixes,
        # mimicking the site's player.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000, 1998)
        random2 = random.randint(1000, 9999)

        return "%d%d%d" % (nowTime, random1, random2)

    def _get_file_ID_mix_string(self, seed):
        # Deterministically shuffle the alphabet with a linear-congruential
        # generator seeded by the server-provided value.
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed = (seed * 211 + 30031) % 65536
            index = math.floor(seed / 65536 * len(source))
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        # return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        # Translate the '*'-separated index list into characters of the
        # seed-shuffled alphabet.
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        files_info = []
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # column 8,9 of fileid represent the segment number
        # fileid[7:9] should be changed
        for index, key in enumerate(keys):
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    # Regexes for the pieces embedded in the page
    VIDEO_URL_RE = r'flv_url=(.*?)&'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def report_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        # Get webpage content
        try:
            webpage_bytes = compat_urllib_request.urlopen(url).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return

        result = re.search(self.VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group(1))

        result = re.search(self.VIDEO_TITLE_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = result.group(1)

        result = re.search(self.VIDEO_THUMB_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = result.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading extry"""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report downloading extry"""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report downloading extry"""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report downloading extry"""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video page URL')
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)

        # Extract video links on video page
        """Extract video links of all sizes"""
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.trouble(u'ERROR: unable to extract video links')
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError:  # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The CDN URL follows directly from the page path; no extraction needed
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            # Search the page; return the unescaped first group or `default`.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # Was misspelled 'uploader_date', which no consumer reads;
            # 'upload_date' is the documented field name.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
                     ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
            return

        response = json.loads(webpage)
        if type(response) != list:
            # The API signals errors with a JSON object instead of a list
            error_text = response.get('error', 'unknown error')
            self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
            return
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            # Channel URL: page through the archive listing
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short page means we've reached the end of the archive
            if not paged or page_count != limit:
                break
            offset += limit
        return info
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
    IE_NAME = u'FunnyOrDie'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            self._downloader.trouble(u'ERROR: unable to find video information')
            return
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
        if not m:
            self._downloader.trouble(u'Cannot find video title')
            return
        title = unescapeHTML(m.group('title'))

        # Description is optional — fall back to None
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
class TweetReelIE(InfoExtractor):
    """Information extractor for tweetreel.com."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'
    IE_NAME = u'tweetreel'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find status ID')
            return
        status_id = m.group(1)

        # Description is best-effort; strip embedded <a> tags from the tweet
        m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
        if not m:
            self._downloader.trouble(u'WARNING: Cannot find description')
            desc = None
        else:
            desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()

        m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find uploader')
            return
        uploader = unescapeHTML(m.group('uploader'))
        uploader_id = unescapeHTML(m.group('uploader_id'))

        m = re.search(r'<span unixtime="([0-9]+)"', webpage)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find upload date')
            return
        upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')

        # The .mov file lives at a fixed path keyed by the status id
        video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mov',
            'title': desc,
            'description': desc,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'internal_id': status_id,
            'upload_date': upload_date
        }
        return [info]
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com game trailer pages."""

    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL uses verbose-mode whitespace, so re.VERBOSE is required
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        gameID = m.group('gameID')
        videourl = 'http://store.steampowered.com/video/%s/' % gameID
        webpage = self._download_webpage(videourl, gameID)
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        videos = []
        # Movie entries and title spans appear in the same order on the page
        for vid, vtitle in zip(mweb, titles):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            if not video_url:
                self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
            info = {
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title)
            }
            videos.append(info)
        return videos
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""

    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        # Download URL follows directly from the recorded-video id
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        m = re.search(r'data-title="(?P<title>.+)"', webpage)
        title = m.group('title')
        m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"', webpage)
        uploader = m.group('uploader')
        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader
        }
        return [info]
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows (audio streams)."""

    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
    IE_NAME = u'rbmaradio'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)
        # The show metadata is embedded as a JSON assignment in a script tag
        m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find metadata')
        json_data = m.group(1)

        try:
            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        video_url = data['akamai_url'] + '&cbr=256'
        url_parts = compat_urllib_parse_urlparse(video_url)
        video_ext = url_parts.path.rpartition('.')[2]
        return [{
            'id': video_id,
            'url': video_url,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': data.get('host', {}).get('name'),
            'uploader_id': data.get('host', {}).get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
            'ext': video_ext,
        }]
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        # Return the entry matching the requested format, or None.
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # The site gates content behind an age check cookie
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date (best-effort)
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.to_stderr(u'WARNING: unable to extract video date')
            upload_date = None
        else:
            upload_date = result.group('date').strip()

        # Get the video uploader (best-effort)
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.to_stderr(u'WARNING: unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))

        formats = []
        for link in links:
            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # Path segment 4 encodes "<size>_<bitrate>_<id>"
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self._downloader.to_screen(u'[youporn] Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # BUGFIX: previously tested the stale `result` variable here, so a
            # missing format was never reported; test the lookup result itself.
            if format is None:
                self._downloader.trouble(u'ERROR: requested format not available')
                return
            return [format]
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        # Title comes straight from the URL path
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # BUGFIX: the failure message previously said "video title"
            self._downloader.trouble(u'ERROR: unable to extract video upload date')
            return
        upload_date = result.group('date')

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video title
        result = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = result.group('title').strip()

        # Get the embed page, where the actual stream URL lives
        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = result.group(0).strip()
        video_id = result.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # Get the video URL
        result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract video url')
        video_url = result.group('source')

        info = {'id': video_id,
                'url': video_url,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv',
                'player_url': embed_page_url}

        return [info]
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes (one info dict per track)."""

    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is embedded as a JS assignment
        m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not m:
            raise ExtractorError(u'Cannot find trax information')
        json_like = m.group(1)
        data = json.loads(json_like)

        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url
        res = []
        # Walk the play/next API until the mix reports its last track
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            info = {
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            }
            res.append(info)
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return res
class KeekIE(InfoExtractor):
    """Information extractor for keek.com short videos."""

    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        # CDN URLs follow directly from the video id
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)
        m = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
        title = unescapeHTML(m.group('title'))
        m = re.search(r'<div class="bio-names-and-report">[\s\n]+<h4>(?P<uploader>\w+)</h4>', webpage)
        uploader = unescapeHTML(m.group('uploader'))
        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader
        }
        return [info]
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""

    _VALID_URL=r'''http://www.ted.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL uses verbose-mode whitespace, so re.VERBOSE is required
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
            return self._playlist_videos_info(url,name,playlist_id)

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="/talks/(.+).html">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)
        info=[]
        # Talk entries and their title paragraphs appear in page order
        for m_video, m_name in zip(m_videos,m_names):
            video_dic={
                'id': m_video.group('video_id'),
                'url': self._talk_video_link(m_video.group('mediaSlug')),
                'ext': 'mp4',
                'title': m_name.group('fullname')
                }
            info.append(video_dic)
        return info

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<h1><span id="altHeadline" >(?P<title>[\s\w:/\.\?=\+-\\\']*)</span></h1>'
        title=re.search(title_RE, webpage).group('title')
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
        }
        return info
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de.

    Resolves the video id from the URL path, fetches the site's XML
    metadata endpoint, and builds the info dict from it.
    """
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            # Trailing slash made the last element empty; use the parent.
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        # ElementTree needs bytes here so the XML declaration's encoding
        # is honored.
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.trouble(u'ERROR: unable to extract download url')
            return
        video_url = url_flv_el.text
        # Derive the extension from the download URL (strip the dot).
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # No format advertised; fall back to a sane default.
            format = 'mp4'
        else:
            format = format_id_el.text
        # Optional fields: default to None when missing.
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
4091 def gen_extractors():
4092 """ Return a list of an instance of every supported extractor.
4093 The order does matter; the first extractor matched is the one handling the URL.
4096 YoutubePlaylistIE(),
4120 StanfordOpenClassroomIE(),