2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
14 import xml.etree.ElementTree
20 from urlparse import parse_qs, urlparse
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader_id:    Nickname or id of the video uploader.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False        # becomes True after _real_initialize() has run once
    _downloader = None    # FileDownloader instance, set via set_downloader()
    _WORKING = True       # subclasses set this to False when known broken

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url) is not None

    def working(self):
        """Getter method for _WORKING."""
        return self._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # Run the (possibly expensive) subclass setup only once per instance.
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        """Short extractor name: the class name minus the 'IE' suffix."""
        return type(self).__name__[:-2]

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Download and return the UTF-8-decoded body of url_or_request.

        Reports progress through the downloader; raises ExtractorError
        (carrying the original traceback) on any network-level failure.
        """
        if note is None:
            note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        try:
            urlh = compat_urllib_request.urlopen(url_or_request)
            webpage_bytes = urlh.read()
            # 'replace' keeps extraction alive on pages with stray bytes.
            return webpage_bytes.decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # NOTE(review): this view of the file is missing interleaved lines —
    # the r'''(?x) opening/closing of _VALID_URL, most entries of the two
    # dicts below, and several try:/else:/if-guard lines inside the
    # methods. Code is preserved verbatim; gaps are flagged where they
    # would otherwise mislead.
        (?:https?://)? # http(s):// (optional)
        (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
           tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
        (?:.*?\#/)? # handle anchor (#/) redirect urls
        (?!view_play_list|my_playlists|artist|playlist) # ignore playlist URLs
        (?: # the various things that can precede the ID:
            (?:(?:v|embed|e)/) # v/ or embed/ or e/
            |(?: # or the v= param in all its forms
                (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                (?:\?|\#!?) # the params delimiter ? or # or #!
                (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
            )? # optional -> youtube.com/xxxx is OK
        )? # all until now is optional -> you can pass the naked ID
        ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
        (?(1).+)? # if we found the ID, everything can follow
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> file extension map (most entries not visible in this chunk)
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # itag -> "WxH" resolution map (entries not visible in this chunk)
    _video_dimensions = {

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # re.VERBOSE because _VALID_URL is written in (?x) verbose style.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube timedtext XML into SubRip (.srt) text.

        NOTE(review): the srt accumulator initialisation, the float()
        conversion of `start`, and the final `return srt` are not visible
        in this chunk.
        """
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'
            end = start + float(dur)
            # Format both timestamps as HH:MM:SS,mmm (SubRip time format).
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'

    def _extract_subtitles(self, video_id):
        """Fetch the caption track list and one track for video_id.

        Returns a (warning_message_or_None, srt_contents_or_None) pair.
        NOTE(review): the try: lines guarding the two urlopen() calls, the
        'en' branch body, and the guard before the bare "unable to download"
        return are not visible in this chunk.
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
            srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        # Map lang_code -> track name.
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            # NOTE(review): the line below appears to be the fallback branch;
            # the intervening srt_lang = 'en' / else: lines are not visible.
            srt_lang = list(srt_lang_list.keys())[0]
        if not srt_lang in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
            srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
            return (u'WARNING: unable to download video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))

    def _print_formats(self, formats):
        """Print the available itags with their extension and dimensions."""
        print('Available formats:')
        # NOTE(review): the `for x in formats:` header is not visible here.
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set language, optionally log in, and confirm age on youtube.com.

        NOTE(review): several lines are not visible in this chunk: the body
        of the no-downloader guard, the username/password defaults, the
        netrc success branch, the try: lines before each urlopen(), the
        guard that skips login when no credentials exist, and the openings
        of the login_form / age_form dict literals.
        """
        if self._downloader is None:
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))

        # Set language (best-effort: failure only warns).
        request = compat_urllib_request.Request(self._LANG_URL)
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))

        # No authentication to be performed
            'current_form': 'loginForm',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, the credentials were rejected.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))

            'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))

    def _extract_id(self, url):
        """Extract the YouTube video id from url via _VALID_URL.

        NOTE(review): the `if mobj is None:` guard and the trailing
        `return video_id` are not visible in this chunk.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(2)

    def _real_extract(self, url):
        """Extract video info dict(s) for a YouTube URL.

        NOTE(review): many try:/else:/if-guard lines, `return`s, and the
        opening/closing of the final results dict are not visible in this
        chunk; visible code is preserved verbatim.
        """
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
            # Unescape the JS-escaped URL (\\/ -> /).
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Get video info: try each &el= variant until one yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))

        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
            video_uploader_id = mobj.group(1)
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
        else: # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: normalise separators, then try several date formats.
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
            video_description = ''

        # closed captions (only when requested)
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
                self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # itag -> signed download URL
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                        video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

        # Build one info dict per selected format.
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

                'url': video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension,
                'format': video_format,
                'thumbnail': video_thumbnail,
                'description': video_description,
                'player_url': player_url,
                'subtitles': video_subtitles,
                'duration': video_duration
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # NOTE(review): this view of the file is missing interleaved lines
    # (try:/else:/if-guard lines, the disclaimer_form dict opening, and
    # the return [{ ... }] delimiters). Code below is preserved verbatim.

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the family-filter disclaimer and confirm age."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age (POST of the filter form)
            'submit': "Continue - I'm over 18",
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        """Extract the video info dict for a Metacafe URL."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate yt- prefixed ids to the YouTube extractor.
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
            # Fallback path: dig the media URL out of the flashvars blob.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # Result dict entries (the surrounding return [{ ... }] is not visible).
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # NOTE(review): this view of the file is missing interleaved lines
    # (if-guards, the quality-selection loop body, and the return-dict
    # delimiters). Code below is preserved verbatim.

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the video info dict for a Dailymotion URL."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        # Strip title suffix and query string from the path component.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter so age-restricted pages still resolve.
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Pick the best available quality key, highest first.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
            self._downloader.trouble(u'ERROR: unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
            self._downloader.trouble(u'ERROR: unable to extract video URL')

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
            # lookin for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
                video_uploader = mobj_official.group(1)
            video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
            # Page shows DD-MM-YYYY; reassemble as YYYYMMDD.
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        # Result dict entries (the surrounding return [{ ... }] is not visible).
            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # NOTE(review): this view of the file is missing interleaved lines
    # (try:/if-guards, the video_url assignment, and the return-dict
    # delimiters). Code below is preserved verbatim.

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the video info dict for a Photobucket URL."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        # Result dict entries (the surrounding return [{ ... }] is not visible).
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # NOTE(review): this view of the file is missing interleaved lines
    # (try:/if-guards before urlopen() calls and regex checks, and the
    # return-dict delimiters). Code below is preserved verbatim.

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract the video info dict for a Yahoo! Video URL.

        new_video guards the one-shot URL-rewriting recursion below.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
                self._downloader.trouble(u'ERROR: Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            # Re-enter once with the canonical /watch/ URL.
            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
        yv_bitrate = '700' # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        # Result dict entries (the surrounding return [{ ... }] is not visible).
            'id': video_id.decode('utf-8'),
            'uploader': video_uploader,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
# NOTE(review): this listing carries baked-in line numbers and the numbering
# has gaps (e.g. 977 -> 979, 989 -> 991) — guard clauses such as
# `if mobj is None:`, `try:` openers and `return` statements appear to be
# elided from this view. Comments below describe only the visible lines.
969 class VimeoIE(InfoExtractor):
970 	"""Information extractor for vimeo.com."""
972 	# _VALID_URL matches Vimeo URLs
973 	_VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
# Constructor: delegates to InfoExtractor, which stores the downloader.
976 	def __init__(self, downloader=None):
977 		InfoExtractor.__init__(self, downloader)
# Progress reporting helpers — write status lines through the downloader.
979 	def report_download_webpage(self, video_id):
980 		"""Report webpage download."""
981 		self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
983 	def report_extraction(self, video_id):
984 		"""Report information extraction."""
985 		self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
987 	def _real_extract(self, url, new_video=True):
988 		# Extract ID from URL
989 		mobj = re.match(self._VALID_URL, url)
# NOTE(review): the `if mobj is None:` guard before this error call is elided.
991 			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
994 		video_id = mobj.group(1)
996 		# Retrieve video webpage to extract further information
997 		request = compat_urllib_request.Request(url, None, std_headers)
# NOTE(review): the `try:` opening this network block is elided in the listing.
999 			self.report_download_webpage(video_id)
1000 			webpage_bytes = compat_urllib_request.urlopen(request).read()
1001 			webpage = webpage_bytes.decode('utf-8')
1002 		except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1003 			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1006 		# Now we begin extracting as much information as we can from what we
1007 		# retrieved. First we extract the information common to all extractors,
1008 		# and latter we extract those that are Vimeo specific.
1009 		self.report_extraction(video_id)
1011 		# Extract the config JSON
# Slices the inline player configuration out of the page markup and parses
# it as JSON; everything below reads from this `config` dict.
1013 			config = webpage.split(' = {config:')[1].split(',assets:')[0]
1014 			config = json.loads(config)
1016 			self._downloader.trouble(u'ERROR: unable to extract info section')
1020 		video_title = config["video"]["title"]
1022 		# Extract uploader and uploader_id
1023 		video_uploader = config["video"]["owner"]["name"]
1024 		video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]
1026 		# Extract video thumbnail
1027 		video_thumbnail = config["video"]["thumbnail"]
1029 		# Extract video description
1030 		video_description = get_element_by_attribute("itemprop", "description", webpage)
1031 		if video_description: video_description = clean_html(video_description)
1032 		else: video_description = ''
1034 		# Extract upload date
1035 		video_upload_date = None
1036 		mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1037 		if mobj is not None:
# Concatenates YYYY + MM + DD into the YYYYMMDD form the downloader expects.
1038 			video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1040 		# Vimeo specific: extract request signature and timestamp
1041 		sig = config['request']['signature']
1042 		timestamp = config['request']['timestamp']
1044 		# Vimeo specific: extract video codec and quality information
1045 		# First consider quality, then codecs, then take everything
1046 		# TODO bind to format param
1047 		codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1048 		files = { 'hd': [], 'sd': [], 'other': []}
# Bucket each available codec by quality tier; codec list order encodes
# preference (h264 first).
1049 		for codec_name, codec_extension in codecs:
1050 			if codec_name in config["video"]["files"]:
1051 				if 'hd' in config["video"]["files"][codec_name]:
1052 					files['hd'].append((codec_name, codec_extension, 'hd'))
1053 				elif 'sd' in config["video"]["files"][codec_name]:
1054 					files['sd'].append((codec_name, codec_extension, 'sd'))
# NOTE(review): the `else:` introducing this fallback branch is elided.
1056 					files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
# Pick the first non-empty tier, best quality first.
1058 		for quality in ('hd', 'sd', 'other'):
1059 			if len(files[quality]) > 0:
1060 				video_quality = files[quality][0][2]
1061 				video_codec = files[quality][0][0]
1062 				video_extension = files[quality][0][1]
1063 				self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
# NOTE(review): the `break`/`else:` around this error path is elided.
1066 			self._downloader.trouble(u'ERROR: no known codec found')
# Build the play_redirect URL from the signature/timestamp extracted above.
1069 		video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1070 					%(video_id, sig, timestamp, video_quality, video_codec.upper())
# NOTE(review): the `return [{` opening of the info dictionary is elided;
# the keys below follow the fields documented on InfoExtractor.
1075 			'uploader':	video_uploader,
1076 			'uploader_id': video_uploader_id,
1077 			'upload_date':	video_upload_date,
1078 			'title':	video_title,
1079 			'ext':		video_extension,
1080 			'thumbnail':	video_thumbnail,
1081 			'description':	video_description,
# NOTE(review): baked-in line numbers with gaps — `if ... is None:` guards,
# `try:`, `return` lines and grep_webpage's tuple-argument parentheses are
# elided from this view; comments describe only what is visible.
1085 class ArteTvIE(InfoExtractor):
1086 	"""arte.tv information extractor."""
1088 	_VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
# _LIVE_URL distinguishes live-stream index pages from regular video pages.
1089 	_LIVE_URL = r'index-[0-9]+\.html$'
1091 	IE_NAME = u'arte.tv'
1093 	def __init__(self, downloader=None):
1094 		InfoExtractor.__init__(self, downloader)
# Progress reporting helpers.
1096 	def report_download_webpage(self, video_id):
1097 		"""Report webpage download."""
1098 		self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)
1100 	def report_extraction(self, video_id):
1101 		"""Report information extraction."""
1102 		self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)
# fetch_webpage: download `url` and (presumably) return its raw body;
# the `try:` and `return` lines are elided here — TODO confirm.
1104 	def fetch_webpage(self, url):
1105 		request = compat_urllib_request.Request(url)
1107 			self.report_download_webpage(url)
1108 			webpage = compat_urllib_request.urlopen(request).read()
1109 		except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1110 			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1112 		except ValueError as err:
1113 			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
# grep_webpage: fetch `url`, apply `regex` with `regexFlags`, and collect the
# groups listed in matchTuples as (group_index, dict_key, error_message)
# entries into an `info` dict (its initialization line is elided).
1117 	def grep_webpage(self, url, regex, regexFlags, matchTuples):
1118 		page = self.fetch_webpage(url)
1119 		mobj = re.search(regex, page, regexFlags)
# NOTE(review): `if mobj is None:` guard elided before this error call.
1123 			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1126 		for (i, key, err) in matchTuples:
1127 			if mobj.group(i) is None:
1128 				self._downloader.trouble(err)
# NOTE(review): the `else:` pairing with the guard above is elided.
1131 				info[key] = mobj.group(i)
# extractLiveStream: resolve the live-stream SWF player/path via two
# successive grep_webpage passes.
1135 	def extractLiveStream(self, url):
# Language code ('fr'/'de') is encoded as a path component of the URL.
1136 		video_lang = url.split('/')[-4]
1137 		info = self.grep_webpage(
1139 			r'src="(.*?/videothek_js.*?\.js)',
1142 			(1, 'url', u'ERROR: Invalid URL: %s' % url)
1145 		http_host = url.split('/')[2]
1146 		next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1147 		info = self.grep_webpage(
1149 			r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1150 				'(http://.*?\.swf).*?' +
1154 			(1, 'path', u'ERROR: could not extract video path: %s' % url),
1155 			(2, 'player', u'ERROR: could not extract video player: %s' % url),
1156 			(3, 'url', u'ERROR: could not extract video url: %s' % url)
1159 		video_url = u'%s/%s' % (info.get('url'), info.get('path'))
# extractPlus7Stream: follow the videoref chain (page -> <video> ref ->
# quality URL) and (presumably) return the assembled info dict — the
# `return` and surrounding dict-literal lines are elided.
1161 	def extractPlus7Stream(self, url):
1162 		video_lang = url.split('/')[-3]
1163 		info = self.grep_webpage(
1165 			r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1168 			(1, 'url', u'ERROR: Invalid URL: %s' % url)
1171 		next_url = compat_urllib_parse.unquote(info.get('url'))
1172 		info = self.grep_webpage(
1174 			r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1177 			(1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
1180 		next_url = compat_urllib_parse.unquote(info.get('url'))
1182 		info = self.grep_webpage(
1184 			r'<video id="(.*?)".*?>.*?' +
1185 				'<name>(.*?)</name>.*?' +
1186 				'<dateVideo>(.*?)</dateVideo>.*?' +
1187 				'<url quality="hd">(.*?)</url>',
1190 			(1, 'id',    u'ERROR: could not extract video id: %s' % url),
1191 			(2, 'title', u'ERROR: could not extract video title: %s' % url),
1192 			(3, 'date',  u'ERROR: could not extract video date: %s' % url),
1193 			(4, 'url',   u'ERROR: could not extract video url: %s' % url)
# Info-dict fields (opening of the dict literal is elided).
1198 			'id':           info.get('id'),
1199 			'url':          compat_urllib_parse.unquote(info.get('url')),
1200 			'uploader':     u'arte.tv',
1201 			'upload_date':  info.get('date'),
1202 			'title':        info.get('title').decode('utf-8'),
# _real_extract: dispatch live-stream URLs vs. regular ("plus7") videos.
1208 	def _real_extract(self, url):
1209 		video_id = url.split('/')[-1]
1210 		self.report_extraction(video_id)
1212 		if re.search(self._LIVE_URL, video_id) is not None:
1213 			self.extractLiveStream(url)
# NOTE(review): the `return`/`else:` between the two branches is elided.
1216 			info = self.extractPlus7Stream(url)
# NOTE(review): baked-in line numbers with gaps — guards, `try:` openers,
# `return` statements and some class docstrings are elided from this view.
1221 class GenericIE(InfoExtractor):
1222 	"""Generic last-resort information extractor."""
1225 	IE_NAME = u'generic'
1227 	def __init__(self, downloader=None):
1228 		InfoExtractor.__init__(self, downloader)
# Progress reporting helpers; the first also warns that the generic
# fallback is in use.
1230 	def report_download_webpage(self, video_id):
1231 		"""Report webpage download."""
1232 		self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1233 		self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1235 	def report_extraction(self, video_id):
1236 		"""Report information extraction."""
1237 		self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1239 	def report_following_redirect(self, new_url):
1240 		"""Report information extraction."""
1241 		self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1243 	def _test_redirect(self, url):
1244 		"""Check if it is a redirect, like url shorteners, in case restart chain."""
# HEAD-only request class; its get_method body ("return 'HEAD'" presumably)
# is elided — TODO confirm.
1245 		class HeadRequest(compat_urllib_request.Request):
1246 			def get_method(self):
1249 		class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1251 			Subclass the HTTPRedirectHandler to make it use our
1252 			HeadRequest also on the redirected URL
1254 			def redirect_request(self, req, fp, code, msg, headers, newurl):
1255 				if code in (301, 302, 303, 307):
# Spaces in redirect targets must be percent-encoded before reuse.
1256 					newurl = newurl.replace(' ', '%20')
# Drop entity headers that only applied to the original request body.
1257 					newheaders = dict((k,v) for k,v in req.headers.items()
1258 									  if k.lower() not in ("content-length", "content-type"))
1259 					return HeadRequest(newurl,
1261 									   origin_req_host=req.get_origin_req_host(),
# NOTE(review): remaining HeadRequest kwargs and the `else:` branch elided.
1264 					raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1266 		class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1268 			Fallback to GET if HEAD is not allowed (405 HTTP error)
1270 			def http_error_405(self, req, fp, code, msg, headers):
# Retry the same URL as a plain (GET) request with entity headers removed.
1274 				newheaders = dict((k,v) for k,v in req.headers.items()
1275 								  if k.lower() not in ("content-length", "content-type"))
1276 				return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1278 												 origin_req_host=req.get_origin_req_host(),
# Build a minimal opener wired with the two custom handlers above.
1282 		opener = compat_urllib_request.OpenerDirector()
1283 		for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1284 						HTTPMethodFallback, HEADRedirectHandler,
1285 						compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1286 			opener.add_handler(handler())
1288 		response = opener.open(HeadRequest(url))
1289 		new_url = response.geturl()
# NOTE(review): the comparison of new_url to url (and its `return False`)
# is elided; on a real redirect, re-enter the download chain with the
# resolved URL.
1294 		self.report_following_redirect(new_url)
1295 		self._downloader.download([new_url])
# NOTE(review): the `return True` ending _test_redirect is elided.
1298 	def _real_extract(self, url):
# Short-circuit: URL shorteners are handled by restarting the chain.
1299 		if self._test_redirect(url): return
1301 		video_id = url.split('/')[-1]
1302 		request = compat_urllib_request.Request(url)
# NOTE(review): `try:` opener elided before this network block.
1304 			self.report_download_webpage(video_id)
1305 			webpage = compat_urllib_request.urlopen(request).read()
1306 		except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1307 			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1309 		except ValueError as err:
1310 			# since this is the last-resort InfoExtractor, if
1311 			# this error is thrown, it'll be thrown here
1312 			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1315 		self.report_extraction(video_id)
1316 		# Start with something easy: JW Player in SWFObject
1317 		mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
# NOTE(review): `if mobj is None:` guard elided.
1319 			# Broaden the search a little bit
1320 			mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1322 			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1325 		# It's possible that one of the regexes
1326 		# matched, but returned an empty group:
1327 		if mobj.group(1) is None:
1328 			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1331 		video_url = compat_urllib_parse.unquote(mobj.group(1))
1332 		video_id = os.path.basename(video_url)
1334 		# here's a fun little line of code for you:
1335 		video_extension = os.path.splitext(video_id)[1][1:]
1336 		video_id = os.path.splitext(video_id)[0]
1338 		# it's tempting to parse this further, but you would
1339 		# have to take into account all the variations like
1340 		#   Video Title - Site Name
1341 		#   Site Name | Video Title
1342 		#   Video Title - Tagline | Site Name
1343 		# and so on and so forth; it's just not practical
1344 		mobj = re.search(r'<title>(.*)</title>', webpage)
1346 			self._downloader.trouble(u'ERROR: unable to extract title')
1348 		video_title = mobj.group(1)
1350 		# video uploader is domain name
1351 		mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
# NOTE(review): message says "title" but this path extracts the uploader —
# likely a copy-paste in the original; left untouched here.
1353 			self._downloader.trouble(u'ERROR: unable to extract title')
1355 		video_uploader = mobj.group(1)
# Info-dict fields (the `return [{` opener and 'id'/'url' keys are elided).
1360 			'uploader':	video_uploader,
1361 			'upload_date':	None,
1362 			'title':	video_title,
1363 			'ext':		video_extension,
# NOTE(review): baked-in line numbers with gaps — guards, `try:` openers and
# `return` lines are elided; comments describe only the visible lines.
1367 class YoutubeSearchIE(InfoExtractor):
1368 	"""Information Extractor for YouTube search queries."""
# Accepts "ytsearch:", "ytsearchN:" and "ytsearchall:" pseudo-URLs.
1369 	_VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1370 	_API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1371 	_max_youtube_results = 1000
1372 	IE_NAME = u'youtube:search'
1374 	def __init__(self, downloader=None):
1375 		InfoExtractor.__init__(self, downloader)
1377 	def report_download_page(self, query, pagenum):
1378 		"""Report attempt to download search page with given number."""
1379 		query = query.decode(preferredencoding())
1380 		self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
# Parse the prefix ("", "all" or a number) and dispatch to
# _download_n_results with the appropriate result count.
1382 	def _real_extract(self, query):
1383 		mobj = re.match(self._VALID_URL, query)
# NOTE(review): `if mobj is None:` guard elided.
1385 			self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1388 		prefix, query = query.split(':')
1390 		query = query.encode('utf-8')
# NOTE(review): the `if prefix == '':` branch line is elided; an empty
# prefix downloads a single result.
1392 			self._download_n_results(query, 1)
1394 		elif prefix == 'all':
1395 			self._download_n_results(query, self._max_youtube_results)
# NOTE(review): `else:` / `n = int(prefix)` / `if n <= 0:` elided before
# this error path.
1401 					self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1403 				elif n > self._max_youtube_results:
# Clamp over-large requests to the API's practical maximum.
1404 					self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1405 					n = self._max_youtube_results
1406 				self._download_n_results(query, n)
1408 			except ValueError: # parsing prefix as integer fails
1409 				self._download_n_results(query, 1)
1412 	def _download_n_results(self, query, n):
1413 		"""Downloads a specified number of results for a query"""
# Page through the GData API 50 results at a time until `limit` is reached
# (the initializations of video_ids / pagenum / limit are elided).
1419 		while (50 * pagenum) < limit:
1420 			self.report_download_page(query, pagenum+1)
1421 			result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1422 			request = compat_urllib_request.Request(result_url)
# NOTE(review): `try:` opener elided before this network call.
1424 				data = compat_urllib_request.urlopen(request).read()
1425 			except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1426 				self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
1428 			api_response = json.loads(data)['data']
1430 			new_ids = list(video['id'] for video in api_response['items'])
1431 			video_ids += new_ids
# Never request more than the API reports as available.
1433 			limit = min(n, api_response['totalItems'])
# NOTE(review): the `pagenum += 1` loop increment is elided.
1436 		if len(video_ids) > n:
1437 			video_ids = video_ids[:n]
1438 		for id in video_ids:
1439 			self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
# NOTE(review): baked-in line numbers with gaps — guards, `try:` openers and
# `return` lines are elided. Structure parallels YoutubeSearchIE but pages
# through HTML search results instead of a JSON API.
1443 class GoogleSearchIE(InfoExtractor):
1444 	"""Information Extractor for Google Video search queries."""
1445 	_VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1446 	_TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
# Regex pulling video ids out of result-page anchors.
1447 	_VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
# Presence of the "next page" control marks that more results exist.
1448 	_MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1449 	_max_google_results = 1000
1450 	IE_NAME = u'video.google:search'
1452 	def __init__(self, downloader=None):
1453 		InfoExtractor.__init__(self, downloader)
1455 	def report_download_page(self, query, pagenum):
1456 		"""Report attempt to download playlist page with given number."""
1457 		query = query.decode(preferredencoding())
1458 		self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
# Parse the prefix ("", "all", or a number) and dispatch accordingly.
1460 	def _real_extract(self, query):
1461 		mobj = re.match(self._VALID_URL, query)
# NOTE(review): `if mobj is None:` guard elided.
1463 			self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1466 		prefix, query = query.split(':')
1468 		query = query.encode('utf-8')
# NOTE(review): `if prefix == '':` line elided; empty prefix → one result.
1470 			self._download_n_results(query, 1)
1472 		elif prefix == 'all':
1473 			self._download_n_results(query, self._max_google_results)
# NOTE(review): `else:` / `n = int(prefix)` / `if n <= 0:` elided here.
1479 					self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1481 				elif n > self._max_google_results:
1482 					self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1483 					n = self._max_google_results
1484 				self._download_n_results(query, n)
1486 			except ValueError: # parsing prefix as integer fails
1487 				self._download_n_results(query, 1)
1490 	def _download_n_results(self, query, n):
1491 		"""Downloads a specified number of results for a query"""
# Loop over result pages (video_ids / pagenum initializations elided).
1497 			self.report_download_page(query, pagenum)
1498 			result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
1499 			request = compat_urllib_request.Request(result_url)
# NOTE(review): `try:` opener elided before this network call.
1501 				page = compat_urllib_request.urlopen(request).read()
1502 			except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1503 				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1506 			# Extract video identifiers
1507 			for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1508 				video_id = mobj.group(1)
1509 				if video_id not in video_ids:
1510 					video_ids.append(video_id)
1511 					if len(video_ids) == n:
1512 						# Specified n videos reached
1513 						for id in video_ids:
1514 							self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
# NOTE(review): the `return` after the early-exit download loop is elided.
1517 			if re.search(self._MORE_PAGES_INDICATOR, page) is None:
# No further pages: download whatever was collected and stop.
1518 				for id in video_ids:
1519 					self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1522 			pagenum = pagenum + 1
# NOTE(review): baked-in line numbers with gaps — guards, `try:` openers and
# `return` lines are elided. Same shape as GoogleSearchIE, plus an
# `already_seen` set for deduplication.
1525 class YahooSearchIE(InfoExtractor):
1526 	"""Information Extractor for Yahoo! Video search queries."""
1529 	_VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1530 	_TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1531 	_VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1532 	_MORE_PAGES_INDICATOR = r'\s*Next'
1533 	_max_yahoo_results = 1000
1534 	IE_NAME = u'video.yahoo:search'
1536 	def __init__(self, downloader=None):
1537 		InfoExtractor.__init__(self, downloader)
1539 	def report_download_page(self, query, pagenum):
1540 		"""Report attempt to download playlist page with given number."""
1541 		query = query.decode(preferredencoding())
1542 		self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
# Parse the prefix ("", "all", or a number) and dispatch accordingly.
1544 	def _real_extract(self, query):
1545 		mobj = re.match(self._VALID_URL, query)
# NOTE(review): `if mobj is None:` guard elided.
1547 			self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1550 		prefix, query = query.split(':')
1552 		query = query.encode('utf-8')
# NOTE(review): `if prefix == '':` line elided; empty prefix → one result.
1554 			self._download_n_results(query, 1)
1556 		elif prefix == 'all':
1557 			self._download_n_results(query, self._max_yahoo_results)
# NOTE(review): `else:` / `n = int(prefix)` / `if n <= 0:` elided here.
1563 					self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1565 				elif n > self._max_yahoo_results:
1566 					self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1567 					n = self._max_yahoo_results
1568 				self._download_n_results(query, n)
1570 			except ValueError: # parsing prefix as integer fails
1571 				self._download_n_results(query, 1)
1574 	def _download_n_results(self, query, n):
1575 		"""Downloads a specified number of results for a query"""
# Dedup set persists across pages; video_ids/pagenum inits are elided.
1578 		already_seen = set()
1582 			self.report_download_page(query, pagenum)
1583 			result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
1584 			request = compat_urllib_request.Request(result_url)
# NOTE(review): `try:` opener elided before this network call.
1586 				page = compat_urllib_request.urlopen(request).read()
1587 			except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1588 				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1591 			# Extract video identifiers
1592 			for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1593 				video_id = mobj.group(1)
1594 				if video_id not in already_seen:
1595 					video_ids.append(video_id)
1596 					already_seen.add(video_id)
1597 					if len(video_ids) == n:
1598 						# Specified n videos reached
1599 						for id in video_ids:
1600 							self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
# NOTE(review): the `return` after the early-exit download loop is elided.
1603 			if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1604 				for id in video_ids:
1605 					self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1608 			pagenum = pagenum + 1
# NOTE(review): baked-in line numbers with gaps — guards, `try:` openers,
# `break`/`return` lines and loop initializations are elided from this view.
1611 class YoutubePlaylistIE(InfoExtractor):
1612 	"""Information Extractor for YouTube playlists."""
# Matches playlist/course/artist/user-playlist URL shapes plus bare
# PL.../EC... ids; group(1) selects the URL kind, group(2) is the list id.
1614 	_VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
1615 	_TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1616 	_VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;([^&"]+&amp;)*list=.*?%s'
1617 	_MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
1618 	IE_NAME = u'youtube:playlist'
1620 	def __init__(self, downloader=None):
1621 		InfoExtractor.__init__(self, downloader)
1623 	def report_download_page(self, playlist_id, pagenum):
1624 		"""Report attempt to download playlist page with given number."""
1625 		self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1627 	def _real_extract(self, url):
1628 		# Extract playlist id
1629 		mobj = re.match(self._VALID_URL, url)
# NOTE(review): `if mobj is None:` guard elided.
1631 			self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# Single-video case: a trailing video id means download just that video.
1635 		if mobj.group(3) is not None:
1636 			self._downloader.download([mobj.group(3)])
# NOTE(review): the `return` after the single-video branch is elided.
1639 		# Download playlist pages
1640 		# prefix is 'p' as default for playlists but there are other types that need extra care
1641 		playlist_prefix = mobj.group(1)
1642 		if playlist_prefix == 'a':
1643 			playlist_access = 'artist'
# NOTE(review): the `else:` introducing the default branch is elided.
1645 			playlist_prefix = 'p'
1646 			playlist_access = 'view_play_list'
1647 		playlist_id = mobj.group(2)
# Page loop (video_ids / pagenum initializations are elided).
1652 			self.report_download_page(playlist_id, pagenum)
1653 			url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1654 			request = compat_urllib_request.Request(url)
# NOTE(review): `try:` opener elided before this network call.
1656 				page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1657 			except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1658 				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1661 			# Extract video identifiers
1663 			for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1664 				if mobj.group(1) not in ids_in_page:
1665 					ids_in_page.append(mobj.group(1))
1666 			video_ids.extend(ids_in_page)
# Stop when the "Next »" control disappears from the page.
1668 			if self._MORE_PAGES_INDICATOR not in page:
1670 			pagenum = pagenum + 1
# Apply the user's --playlist-start/--playlist-end window.
1672 		total = len(video_ids)
1674 		playliststart = self._downloader.params.get('playliststart', 1) - 1
1675 		playlistend = self._downloader.params.get('playlistend', -1)
1676 		if playlistend == -1:
1677 			video_ids = video_ids[playliststart:]
# NOTE(review): the `else:` for the bounded-slice branch is elided.
1679 			video_ids = video_ids[playliststart:playlistend]
1681 		if len(video_ids) == total:
1682 			self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
# NOTE(review): `else:` elided before the filtered-count message.
1684 			self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))
1686 		for id in video_ids:
1687 			self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
# NOTE(review): baked-in line numbers with gaps — guards, `try:` openers,
# `break` lines and loop initializations are elided from this view.
1691 class YoutubeChannelIE(InfoExtractor):
1692 	"""Information Extractor for YouTube channels."""
1694 	_VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1695 	_TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
# Presence of the "Next »" control marks that more pages exist.
1696 	_MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
1697 	IE_NAME = u'youtube:channel'
1699 	def report_download_page(self, channel_id, pagenum):
1700 		"""Report attempt to download channel page with given number."""
1701 		self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1703 	def _real_extract(self, url):
1704 		# Extract channel id
1705 		mobj = re.match(self._VALID_URL, url)
# NOTE(review): `if mobj is None:` guard elided.
1707 			self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1710 		# Download channel pages
1711 		channel_id = mobj.group(1)
# Page loop (video_ids / pagenum initializations are elided).
1716 			self.report_download_page(channel_id, pagenum)
1717 			url = self._TEMPLATE_URL % (channel_id, pagenum)
1718 			request = compat_urllib_request.Request(url)
# NOTE(review): `try:` opener elided before this network call.
1720 				page = compat_urllib_request.urlopen(request).read().decode('utf8')
1721 			except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1722 				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1725 			# Extract video identifiers
1727 			for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1728 				if mobj.group(1) not in ids_in_page:
1729 					ids_in_page.append(mobj.group(1))
1730 			video_ids.extend(ids_in_page)
# Stop when the "Next »" control disappears (the `break` line is elided).
1732 			if self._MORE_PAGES_INDICATOR not in page:
1734 			pagenum = pagenum + 1
1736 		self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1738 		for id in video_ids:
1739 			self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
# NOTE(review): baked-in line numbers with gaps — guards, `try:` openers,
# `break` lines and loop initializations are elided from this view.
1743 class YoutubeUserIE(InfoExtractor):
1744 	"""Information Extractor for YouTube users."""
# Accepts youtube.com/user/NAME URLs and the "ytuser:NAME" pseudo-scheme.
1746 	_VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1747 	_TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
# GData caps each query at 50 results, so uploads are fetched page by page.
1748 	_GDATA_PAGE_SIZE = 50
1749 	_GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1750 	_VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1751 	IE_NAME = u'youtube:user'
1753 	def __init__(self, downloader=None):
1754 		InfoExtractor.__init__(self, downloader)
1756 	def report_download_page(self, username, start_index):
1757 		"""Report attempt to download user page."""
1758 		self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1759 				(username, start_index, start_index + self._GDATA_PAGE_SIZE))
1761 	def _real_extract(self, url):
# Extract the username from the URL.
1763 		mobj = re.match(self._VALID_URL, url)
# NOTE(review): `if mobj is None:` guard elided.
1765 			self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1768 		username = mobj.group(1)
1770 		# Download video ids using YouTube Data API. Result size per
1771 		# query is limited (currently to 50 videos) so we need to query
1772 		# page by page until there are no video ids - it means we got
# (continuation of the comment above is elided, as are the video_ids /
# pagenum initializations and the `while` line.)
1779 			start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1780 			self.report_download_page(username, start_index)
1782 			request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
# NOTE(review): `try:` opener elided before this network call.
1785 				page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1786 			except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1787 				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1790 			# Extract video identifiers
1793 			for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1794 				if mobj.group(1) not in ids_in_page:
1795 					ids_in_page.append(mobj.group(1))
1797 			video_ids.extend(ids_in_page)
1799 			# A little optimization - if current page is not
1800 			# "full", ie. does not contain PAGE_SIZE video ids then
1801 			# we can assume that this page is the last one - there
1802 			# are no more ids on further pages - no need to query
# (the `break` and `pagenum += 1` lines following are elided.)
1805 			if len(ids_in_page) < self._GDATA_PAGE_SIZE:
# Apply the user's --playlist-start/--playlist-end window.
1810 		all_ids_count = len(video_ids)
1811 		playliststart = self._downloader.params.get('playliststart', 1) - 1
1812 		playlistend = self._downloader.params.get('playlistend', -1)
1814 		if playlistend == -1:
1815 			video_ids = video_ids[playliststart:]
# NOTE(review): `else:` for the bounded-slice branch is elided.
1817 			video_ids = video_ids[playliststart:playlistend]
1819 		self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1820 				(username, all_ids_count, len(video_ids)))
1822 		for video_id in video_ids:
1823 			self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
# NOTE(review): baked-in line numbers with gaps — guards, `try:` openers,
# `break` lines and loop initializations are elided. Mirrors YoutubeUserIE
# but drives blip.tv's Ajax episode-list endpoint. `self._PAGE_SIZE` is
# referenced below but its class-level definition is not visible here —
# presumably elided; TODO confirm.
1826 class BlipTVUserIE(InfoExtractor):
1827 	"""Information Extractor for blip.tv users."""
1829 	_VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1831 	IE_NAME = u'blip.tv:user'
1833 	def __init__(self, downloader=None):
1834 		InfoExtractor.__init__(self, downloader)
1836 	def report_download_page(self, username, pagenum):
1837 		"""Report attempt to download user page."""
1838 		self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1839 				(self.IE_NAME, username, pagenum))
1841 	def _real_extract(self, url):
# Extract the username from the URL.
1843 		mobj = re.match(self._VALID_URL, url)
# NOTE(review): `if mobj is None:` guard elided.
1845 			self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1848 		username = mobj.group(1)
# The numeric users_id needed by the Ajax endpoint is scraped from the
# user's page markup below.
1850 		page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1852 		request = compat_urllib_request.Request(url)
# NOTE(review): `try:` opener elided before this network block.
1855 			page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1856 			mobj = re.search(r'data-users-id="([^"]+)"', page)
1857 			page_base = page_base % mobj.group(1)
1858 		except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1859 			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1863 		# Download video ids using BlipTV Ajax calls. Result size per
1864 		# query is limited (currently to 12 videos) so we need to query
1865 		# page by page until there are no video ids - it means we got
# (continuation of this comment, video_ids/pagenum initializations and the
# `while` line are elided.)
1872 			self.report_download_page(username, pagenum)
1874 			request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )
# NOTE(review): `try:` opener elided before this network call.
1877 				page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1878 			except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
# Note: uses str(err) here, unlike the compat_str(err) used elsewhere.
1879 				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1882 			# Extract video identifiers
1885 			for mobj in re.finditer(r'href="/([^"]+)"', page):
1886 				if mobj.group(1) not in ids_in_page:
1887 					ids_in_page.append(unescapeHTML(mobj.group(1)))
1889 			video_ids.extend(ids_in_page)
1891 			# A little optimization - if current page is not
1892 			# "full", ie. does not contain PAGE_SIZE video ids then
1893 			# we can assume that this page is the last one - there
1894 			# are no more ids on further pages - no need to query
# (the `break` and `pagenum += 1` lines following are elided.)
1897 			if len(ids_in_page) < self._PAGE_SIZE:
# Apply the user's --playlist-start/--playlist-end window.
1902 		all_ids_count = len(video_ids)
1903 		playliststart = self._downloader.params.get('playliststart', 1) - 1
1904 		playlistend = self._downloader.params.get('playlistend', -1)
1906 		if playlistend == -1:
1907 			video_ids = video_ids[playliststart:]
# NOTE(review): `else:` for the bounded-slice branch is elided.
1909 			video_ids = video_ids[playliststart:playlistend]
1911 		self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1912 				(self.IE_NAME, username, all_ids_count, len(video_ids)))
1914 		for video_id in video_ids:
1915 			self._downloader.download([u'http://blip.tv/'+video_id])
# NOTE(review): baked-in line numbers with gaps — guards, `try:` openers and
# `return` lines are elided from this view.
1918 class DepositFilesIE(InfoExtractor):
1919 	"""Information extractor for depositfiles.com"""
# `(?#locale)` is a regex comment: the `../` segment it annotates is the
# two-letter locale path component.
1921 	_VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1923 	def report_download_webpage(self, file_id):
1924 		"""Report webpage download."""
1925 		self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1927 	def report_extraction(self, file_id):
1928 		"""Report information extraction."""
1929 		self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1931 	def _real_extract(self, url):
1932 		file_id = url.split('/')[-1]
1933 		# Rebuild url in english locale
1934 		url = 'http://depositfiles.com/en/files/' + file_id
1936 		# Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 simulates pressing the free-download button.
1937 		free_download_indication = { 'gateway_result' : '1' }
1938 		request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
# NOTE(review): `try:` opener elided before this network call.
1940 			self.report_download_webpage(file_id)
1941 			webpage = compat_urllib_request.urlopen(request).read()
1942 		except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1943 			self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
1946 		# Search for the real file URL
1947 		mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1948 		if (mobj is None) or (mobj.group(1) is None):
1949 			# Try to figure out reason of the error.
# Site error pages carry an "Attention..." banner explaining the refusal.
1950 			mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1951 			if (mobj is not None) and (mobj.group(1) is not None):
# Collapse whitespace in the banner text before reporting it.
1952 				restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1953 				self._downloader.trouble(u'ERROR: %s' % restriction_message)
# NOTE(review): `else:` for the generic-failure branch is elided.
1955 				self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1958 		file_url = mobj.group(1)
1959 		file_extension = os.path.splitext(file_url)[1][1:]
1961 		# Search for file title
1962 		mobj = re.search(r'<b title="(.*?)">', webpage)
# NOTE(review): `if mobj is None:` guard elided.
1964 			self._downloader.trouble(u'ERROR: unable to extract title')
1966 		file_title = mobj.group(1).decode('utf-8')
# Info-dict fields (the `return [{` opener is elided); .decode('utf-8')
# calls mark this as Python 2 bytes-to-unicode conversion.
1969 			'id':		file_id.decode('utf-8'),
1970 			'url':		file_url.decode('utf-8'),
1972 			'upload_date':	None,
1973 			'title':	file_title,
1974 			'ext':		file_extension.decode('utf-8'),
1978 class FacebookIE(InfoExtractor):
1979 """Information Extractor for Facebook"""
# NOTE(review): non-contiguous excerpt — try:/else:/return lines and the
# _video_extensions mapping body are elided from this view.
1982 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1983 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1984 _NETRC_MACHINE = 'facebook'
# Format names ordered best-first; used for --format-limit slicing below.
1985 _available_formats = ['video', 'highqual', 'lowqual']
1986 _video_extensions = {
1991 IE_NAME = u'facebook'
1993 def __init__(self, downloader=None):
1994 InfoExtractor.__init__(self, downloader)
1996 def _reporter(self, message):
1997 """Add header and report message."""
1998 self._downloader.to_screen(u'[facebook] %s' % message)
2000 def report_login(self):
2001 """Report attempt to log in."""
2002 self._reporter(u'Logging in')
2004 def report_video_webpage_download(self, video_id):
2005 """Report attempt to download video webpage."""
2006 self._reporter(u'%s: Downloading video webpage' % video_id)
2008 def report_information_extraction(self, video_id):
2009 """Report attempt to extract video information."""
2010 self._reporter(u'%s: Extracting video information' % video_id)
2012 def _parse_page(self, video_webpage):
2013 """Extract video information from page"""
# Field name -> regex over the raw page; each match is unicode-unescaped
# (values live inside JavaScript string literals) and then URL-unquoted.
2015 data = {'title': r'\("video_title", "(.*?)"\)',
2016 'description': r'<div class="datawrap">(.*?)</div>',
2017 'owner': r'\("video_owner_name", "(.*?)"\)',
2018 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2021 for piece in data.keys():
2022 mobj = re.search(data[piece], video_webpage)
2023 if mobj is not None:
2024 video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
# One URL per available format name ("video_src", "highqual_src", ...).
2028 for fmt in self._available_formats:
2029 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2030 if mobj is not None:
2031 # URL is in a Javascript segment inside an escaped Unicode format within
2032 # the generally utf-8 page
2033 video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
2034 video_info['video_urls'] = video_urls
2038 def _real_initialize(self):
# Logging in is optional: on any failure only a WARNING is emitted and
# extraction proceeds unauthenticated.
2039 if self._downloader is None:
2044 downloader_params = self._downloader.params
2046 # Attempt to use provided username and password or .netrc data
2047 if downloader_params.get('username', None) is not None:
2048 useremail = downloader_params['username']
2049 password = downloader_params['password']
2050 elif downloader_params.get('usenetrc', False):
2052 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2053 if info is not None:
2057 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2058 except (IOError, netrc.NetrcParseError) as err:
2059 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
2062 if useremail is None:
# POST the login form; if the response still contains a login <form>,
# the credentials were rejected (or the rate limit was hit).
2071 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2074 login_results = compat_urllib_request.urlopen(request).read()
2075 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2076 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2078 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2079 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
2082 def _real_extract(self, url):
2083 mobj = re.match(self._VALID_URL, url)
2085 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2087 video_id = mobj.group('ID')
2090 self.report_video_webpage_download(video_id)
2091 request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2093 page = compat_urllib_request.urlopen(request)
2094 video_webpage = page.read()
2095 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2096 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2099 # Start extracting information
2100 self.report_information_extraction(video_id)
2102 # Extract information
2103 video_info = self._parse_page(video_webpage)
# uploader and title are mandatory; thumbnail degrades to a warning.
2106 if 'owner' not in video_info:
2107 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2109 video_uploader = video_info['owner']
2112 if 'title' not in video_info:
2113 self._downloader.trouble(u'ERROR: unable to extract video title')
2115 video_title = video_info['title']
2116 video_title = video_title.decode('utf-8')
2119 if 'thumbnail' not in video_info:
2120 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2121 video_thumbnail = ''
2123 video_thumbnail = video_info['thumbnail']
# upload_date arrives as an RFC-2822 date string and is reformatted to the
# YYYYMMDD form required by the InfoExtractor contract.
2127 if 'upload_date' in video_info:
2128 upload_time = video_info['upload_date']
2129 timetuple = email.utils.parsedate_tz(upload_time)
2130 if timetuple is not None:
2132 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2137 video_description = video_info.get('description', 'No description available.')
2139 url_map = video_info['video_urls']
2141 # Decide which formats to download
# Same format-selection scheme as the other extractors in this file:
# --format-limit truncates the best-first list, then -f picks from it.
2142 req_format = self._downloader.params.get('format', None)
2143 format_limit = self._downloader.params.get('format_limit', None)
2145 if format_limit is not None and format_limit in self._available_formats:
2146 format_list = self._available_formats[self._available_formats.index(format_limit):]
2148 format_list = self._available_formats
2149 existing_formats = [x for x in format_list if x in url_map]
2150 if len(existing_formats) == 0:
2151 self._downloader.trouble(u'ERROR: no known formats available for video')
2153 if req_format is None:
2154 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2155 elif req_format == 'worst':
2156 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2157 elif req_format == '-1':
2158 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2161 if req_format not in url_map:
2162 self._downloader.trouble(u'ERROR: requested format not available')
2164 video_url_list = [(req_format, url_map[req_format])] # Specific format
# One result dictionary per selected format.
2167 for format_param, video_real_url in video_url_list:
2169 video_extension = self._video_extensions.get(format_param, 'mp4')
2172 'id': video_id.decode('utf-8'),
2173 'url': video_real_url.decode('utf-8'),
2174 'uploader': video_uploader.decode('utf-8'),
2175 'upload_date': upload_date,
2176 'title': video_title,
2177 'ext': video_extension.decode('utf-8'),
2178 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2179 'thumbnail': video_thumbnail.decode('utf-8'),
2180 'description': video_description.decode('utf-8'),
2184 class BlipTVIE(InfoExtractor):
2185 """Information extractor for blip.tv"""
# NOTE(review): non-contiguous excerpt — the try:/else:/return framing and the
# computation of 'cchar' ('?' vs '&' query separator, presumably) are elided.
2187 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# Regex to pull the filename extension off the media URL.
2188 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2189 IE_NAME = u'blip.tv'
2191 def report_extraction(self, file_id):
2192 """Report information extraction."""
2193 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2195 def report_direct_download(self, title):
2196 """Report direct download detection."""
2197 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2199 def _real_extract(self, url):
2200 mobj = re.match(self._VALID_URL, url)
2202 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Ask blip.tv for the JSON rendition of the page instead of scraping HTML.
2209 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2210 request = compat_urllib_request.Request(json_url)
2211 self.report_extraction(mobj.group(1))
2214 urlh = compat_urllib_request.urlopen(request)
# If the server answers with the media itself rather than JSON, treat it
# as a direct download and derive id/title/ext from the URL basename.
2215 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2216 basename = url.split('/')[-1]
2217 title,ext = os.path.splitext(basename)
2218 title = title.decode('UTF-8')
2219 ext = ext.replace('.', '')
2220 self.report_direct_download(title)
2225 'upload_date': None,
2230 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2231 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2233 if info is None: # Regular URL
2235 json_code_bytes = urlh.read()
2236 json_code = json_code_bytes.decode('utf-8')
2237 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2238 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
2242 json_data = json.loads(json_code)
# Some responses wrap the payload in a 'Post' object, some don't.
2243 if 'Post' in json_data:
2244 data = json_data['Post']
# blip.tv's datestamp format (e.g. "08-15-12 02:46PM") -> YYYYMMDD.
2248 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2249 video_url = data['media']['url']
2250 umobj = re.match(self._URL_EXT, video_url)
2252 raise ValueError('Can not determine filename extension')
2253 ext = umobj.group(1)
2256 'id': data['item_id'],
2258 'uploader': data['display_name'],
2259 'upload_date': upload_date,
2260 'title': data['title'],
2262 'format': data['media']['mimeType'],
2263 'thumbnail': data['thumbnailUrl'],
2264 'description': data['description'],
2265 'player_url': data['embedUrl']
2267 except (ValueError,KeyError) as err:
2268 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
# Spoof the iTunes User-Agent for the actual media download — presumably the
# media servers gate delivery on it; TODO confirm this is still required.
2271 std_headers['User-Agent'] = 'iTunes/10.6.1'
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de.

    The downloadable FLV URL is not exposed directly; it is rebuilt from the
    media host found in the page's thumbnail <link> plus the numeric video id.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUGFIX: was `self._download.trouble` — the attribute set by
            # InfoExtractor.set_downloader is `_downloader`, so the original
            # raised AttributeError instead of reporting the invalid URL.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail <link> carries the media host and movie path;
        # appending /<id>.flv yields the downloadable media URL.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        # Fields per the InfoExtractor contract; myvideo.de serves FLV only.
        return [{
            'id':          video_id,
            'url':         video_url,
            'uploader':    None,
            'upload_date': None,
            'title':       video_title,
            'ext':         u'flv',
        }]
2324 class ComedyCentralIE(InfoExtractor):
2325 """Information extractor for The Daily Show and Colbert Report """
# NOTE(review): non-contiguous excerpt — try:/else:/return lines, the
# _video_extensions/_video_dimensions mapping bodies and parts of the format
# selection loop are elided from this view.
2327 # urls can be abbreviations like :thedailyshow or :colbert
2328 # urls for episodes like:
2329 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2330 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2331 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2332 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2333 |(https?://)?(www\.)?
2334 (?P<showname>thedailyshow|colbertnation)\.com/
2335 (full-episodes/(?P<episode>.*)|
2337 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2338 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
2340 IE_NAME = u'comedycentral'
# Bitrates ordered worst-first; last element of the parsed list is picked below.
2342 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2344 _video_extensions = {
2352 _video_dimensions = {
2361 def suitable(self, url):
2362 """Receives a URL and returns True if suitable for this IE."""
# Overrides the base class because _VALID_URL is written with re.VERBOSE,
# which the generic InfoExtractor.suitable does not pass.
2363 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
2365 def report_extraction(self, episode_id):
2366 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2368 def report_config_download(self, episode_id):
2369 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2371 def report_index_download(self, episode_id):
2372 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2374 def report_player_url(self, episode_id):
2375 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2378 def _print_formats(self, formats):
2379 print('Available formats:')
2381 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2384 def _real_extract(self, url):
2385 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2387 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Expand ":tds"/":colbert"-style shortcuts to the shows' full-episodes pages
# and re-match so the named groups below are populated.
2390 if mobj.group('shortname'):
2391 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2392 url = u'http://www.thedailyshow.com/full-episodes/'
2394 url = u'http://www.colbertnation.com/full-episodes/'
2395 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2396 assert mobj is not None
2398 if mobj.group('clip'):
2399 if mobj.group('showname') == 'thedailyshow':
2400 epTitle = mobj.group('tdstitle')
2402 epTitle = mobj.group('cntitle')
2405 dlNewest = not mobj.group('episode')
2407 epTitle = mobj.group('showname')
2409 epTitle = mobj.group('episode')
2411 req = compat_urllib_request.Request(url)
2412 self.report_extraction(epTitle)
2414 htmlHandle = compat_urllib_request.urlopen(req)
2415 html = htmlHandle.read()
2416 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2417 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
# The show page redirects to a concrete episode; re-validate the final URL.
2420 url = htmlHandle.geturl()
2421 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2423 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2425 if mobj.group('episode') == '':
2426 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2428 epTitle = mobj.group('episode')
# The mtvnservices player URL embeds the mgid-style content URI.
2430 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', html)
2432 if len(mMovieParams) == 0:
2433 # The Colbert Report embeds the information in a without
2434 # a URL prefix; so extract the alternate reference
2435 # and then add the URL prefix manually.
2437 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', html)
2438 if len(altMovieParams) == 0:
2439 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2442 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
# Resolve the player URL through its redirect chain.
2444 playerUrl_raw = mMovieParams[0][0]
2445 self.report_player_url(epTitle)
2447 urlHandle = compat_urllib_request.urlopen(playerUrl_raw)
2448 playerUrl = urlHandle.geturl()
2449 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2450 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + compat_str(err))
# Fetch the MRSS index feed listing every item of the episode.
2453 uri = mMovieParams[0][1]
2454 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2455 self.report_index_download(epTitle)
2457 indexXml = compat_urllib_request.urlopen(indexUrl).read()
2458 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2459 self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
2464 idoc = xml.etree.ElementTree.fromstring(indexXml)
2465 itemEls = idoc.findall('.//item')
2466 for itemEl in itemEls:
# guid looks like "...:<show>.com:...:<shortMediaId>" — split on ':'.
2467 mediaId = itemEl.findall('./guid')[0].text
2468 shortMediaId = mediaId.split(':')[-1]
2469 showId = mediaId.split(':')[-2].replace('.com', '')
2470 officialTitle = itemEl.findall('./title')[0].text
2471 officialDate = itemEl.findall('./pubDate')[0].text
# Per-item mediaGen config XML lists the available renditions.
2473 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2474 compat_urllib_parse.urlencode({'uri': mediaId}))
2475 configReq = compat_urllib_request.Request(configUrl)
2476 self.report_config_download(epTitle)
2478 configXml = compat_urllib_request.urlopen(configReq).read()
2479 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2480 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
2483 cdoc = xml.etree.ElementTree.fromstring(configXml)
2485 for rendition in cdoc.findall('.//rendition'):
2486 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2490 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2493 if self._downloader.params.get('listformats', None):
2494 self._print_formats([i[0] for i in turls])
2497 # For now, just pick the highest bitrate
2498 format,video_url = turls[-1]
2500 # Get the format arg from the arg stream
2501 req_format = self._downloader.params.get('format', None)
2503 # Select format if we can find one
2506 format, video_url = f, v
2509 # Patch to download from alternative CDN, which does not
2510 # break on current RTMPDump builds
2511 broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
2512 better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"
2514 if video_url.startswith(broken_cdn):
2515 video_url = video_url.replace(broken_cdn, better_cdn)
2517 effTitle = showId + u'-' + epTitle
2522 'upload_date': officialDate,
2527 'description': officialTitle,
2528 'player_url': None #playerUrl
# One info dict per feed item; the accumulated list is the return value.
2531 results.append(info)
2536 class EscapistIE(InfoExtractor):
2537 """Information extractor for The Escapist """
# NOTE(review): non-contiguous excerpt — try:/return framing and the "unable
# to extract" guards around the searches below are elided from this view.
2539 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2540 IE_NAME = u'escapist'
2542 def report_extraction(self, showName):
2543 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2545 def report_config_download(self, showName):
2546 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2548 def _real_extract(self, url):
2549 mobj = re.match(self._VALID_URL, url)
2551 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2553 showName = mobj.group('showname')
2554 videoId = mobj.group('episode')
2556 self.report_extraction(showName)
# Decode the page with the charset announced in the Content-Type header,
# falling back to utf-8 when none is declared.
2558 webPage = compat_urllib_request.urlopen(url)
2559 webPageBytes = webPage.read()
2560 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2561 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2562 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2563 self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
# Metadata comes from <meta> tags; the player config URL is carried in the
# og:video URL's "config=" query parameter (URL-encoded).
2566 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2567 description = unescapeHTML(descMatch.group(1))
2568 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2569 imgUrl = unescapeHTML(imgMatch.group(1))
2570 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2571 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2572 configUrlMatch = re.search('config=(.*)$', playerUrl)
2573 configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2575 self.report_config_download(showName)
2577 configJSON = compat_urllib_request.urlopen(configUrl)
2578 m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
2579 configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
2580 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2581 self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
2584 # Technically, it's JavaScript, not JSON
# Crude single-quote -> double-quote rewrite to make it json.loads-able;
# breaks if any value legitimately contains an apostrophe.
2585 configJSON = configJSON.replace("'", '"')
2588 config = json.loads(configJSON)
2589 except (ValueError,) as err:
2590 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
# The media URL is taken from the second playlist entry.
2593 playlist = config['playlist']
2594 videoUrl = playlist[1]['url']
2599 'uploader': showName,
2600 'upload_date': None,
2603 'thumbnail': imgUrl,
2604 'description': description,
2605 'player_url': playerUrl,
2611 class CollegeHumorIE(InfoExtractor):
2612 """Information extractor for collegehumor.com"""
# NOTE(review): non-contiguous excerpt — try:/return framing and the start of
# the info dict literal are elided from this view.
2615 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2616 IE_NAME = u'collegehumor'
2618 def report_manifest(self, video_id):
2619 """Report information extraction."""
2620 self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))
2622 def report_extraction(self, video_id):
2623 """Report information extraction."""
2624 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2626 def _real_extract(self, url):
2627 mobj = re.match(self._VALID_URL, url)
2629 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2631 video_id = mobj.group('videoid')
2636 'upload_date': None,
2639 self.report_extraction(video_id)
# First request: the "moogaloop" metadata XML for this video id.
2640 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2642 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2643 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2644 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2647 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2649 videoNode = mdoc.findall('./video')[0]
2650 info['description'] = videoNode.findall('./description')[0].text
2651 info['title'] = videoNode.findall('./caption')[0].text
2652 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2653 manifest_url = videoNode.findall('./file')[0].text
2655 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
# Second request: the Adobe HDS (f4m) manifest; hdcore is the player
# version parameter the server expects.
2658 manifest_url += '?hdcore=2.10.3'
2659 self.report_manifest(video_id)
2661 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2662 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2663 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2666 adoc = xml.etree.ElementTree.fromstring(manifestXml)
2668 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2669 node_id = media_node.attrib['url']
2670 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2671 except IndexError as err:
2672 self._downloader.trouble(u'\nERROR: Invalid manifest file')
# Rebuild the fragment URL from the manifest host: /z<id-minus-suffix>/<node>Seg1-Frag1.
2675 url_pr = compat_urllib_parse_urlparse(manifest_url)
2676 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
2683 class XVideosIE(InfoExtractor):
2684 """Information extractor for xvideos.com"""
# NOTE(review): non-contiguous excerpt — the "if mobj is None:"/return guards
# around each search and the final return statement are elided from this view.
2686 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2687 IE_NAME = u'xvideos'
2689 def report_extraction(self, video_id):
2690 """Report information extraction."""
2691 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2693 def _real_extract(self, url):
2694 mobj = re.match(self._VALID_URL, url)
2696 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2698 video_id = mobj.group(1)
2700 webpage = self._download_webpage(url, video_id)
2702 self.report_extraction(video_id)
# The media URL is carried URL-encoded in the page's flv_url parameter.
2706 mobj = re.search(r'flv_url=(.+?)&', webpage)
2708 self._downloader.trouble(u'ERROR: unable to extract video url')
2710 video_url = compat_urllib_parse.unquote(mobj.group(1))
# Title is the <title> text with the trailing " - XVID..." suffix stripped.
2714 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2716 self._downloader.trouble(u'ERROR: unable to extract video title')
2718 video_title = mobj.group(1)
2721 # Extract video thumbnail
# group(0): the whole matched URL is the thumbnail (group 1 is only the
# trailing filename).
2722 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2724 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2726 video_thumbnail = mobj.group(0)
2732 'upload_date': None,
2733 'title': video_title,
2735 'thumbnail': video_thumbnail,
2736 'description': None,
2742 class SoundcloudIE(InfoExtractor):
2743 """Information extractor for soundcloud.com
2744 To access the media, the uid of the song and a stream token
2745 must be extracted from the page source and the script must make
2746 a request to media.soundcloud.com/crossdomain.xml. Then
2747 the media can be grabbed by requesting from an url composed
2748 of the stream token and uid
# NOTE(review): non-contiguous excerpt — try:/return framing and the start of
# the result dict are elided from this view.
2751 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2752 IE_NAME = u'soundcloud'
2754 def __init__(self, downloader=None):
2755 InfoExtractor.__init__(self, downloader)
2757 def report_resolve(self, video_id):
2758 """Report information extraction."""
2759 self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))
2761 def report_extraction(self, video_id):
2762 """Report information extraction."""
2763 self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))
2765 def _real_extract(self, url):
2766 mobj = re.match(self._VALID_URL, url)
2768 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2771 # extract uploader (which is in the url)
2772 uploader = mobj.group(1)
2773 # extract simple title (uploader + slug of song title)
2774 slug_title = mobj.group(2)
2775 simple_title = uploader + u'-' + slug_title
2777 self.report_resolve('%s/%s' % (uploader, slug_title))
# Resolve the human-readable track URL to its numeric API id via the
# /resolve endpoint (client_id is a hard-coded API key).
2779 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2780 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2781 request = compat_urllib_request.Request(resolv_url)
2783 info_json_bytes = compat_urllib_request.urlopen(request).read()
2784 info_json = info_json_bytes.decode('utf-8')
2785 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2786 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2789 info = json.loads(info_json)
2790 video_id = info['id']
2791 self.report_extraction('%s/%s' % (uploader, slug_title))
# Second request: the per-track streams endpoint lists the media URLs.
2793 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2794 request = compat_urllib_request.Request(streams_url)
2796 stream_json_bytes = compat_urllib_request.urlopen(request).read()
2797 stream_json = stream_json_bytes.decode('utf-8')
2798 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2799 self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
2802 streams = json.loads(stream_json)
# Only the 128kbit MP3 HTTP stream is used.
2803 mediaURL = streams['http_mp3_128_url']
2808 'uploader': info['user']['username'],
# NOTE(review): 'created_at' is passed through unformatted, although the
# contract at the top of the file specifies YYYYMMDD — verify.
2809 'upload_date': info['created_at'],
2810 'title': info['title'],
2812 'description': info['description'],
2816 class InfoQIE(InfoExtractor):
2817 """Information extractor for infoq.com"""
# NOTE(review): non-contiguous excerpt — "if mobj is None:"/return guards and
# the start of the result dict are elided from this view.
2818 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2820 def report_extraction(self, video_id):
2821 """Report information extraction."""
2822 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2824 def _real_extract(self, url):
2825 mobj = re.match(self._VALID_URL, url)
2827 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# The whole URL doubles as the progress-reporting id here.
2830 webpage = self._download_webpage(url, video_id=url)
2831 self.report_extraction(url)
# The real media path is base64-encoded in the page's jsclassref attribute;
# decoding and unquoting it yields the path under the rtmpe server.
2834 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2836 self._downloader.trouble(u'ERROR: unable to extract video url')
2838 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2839 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2842 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2844 self._downloader.trouble(u'ERROR: unable to extract video title')
2846 video_title = mobj.group(1)
2848 # Extract description
2849 video_description = u'No description available.'
2850 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2851 if mobj is not None:
2852 video_description = mobj.group(1)
# id and ext are derived from the media filename at the end of the URL.
2854 video_filename = video_url.split('/')[-1]
2855 video_id, extension = video_filename.split('.')
2861 'upload_date': None,
2862 'title': video_title,
2863 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2865 'description': video_description,
2870 class MixcloudIE(InfoExtractor):
2871 """Information extractor for www.mixcloud.com"""
# NOTE(review): non-contiguous excerpt — try:/else:/return framing inside the
# methods below is elided from this view.
2873 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2874 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2875 IE_NAME = u'mixcloud'
2877 def __init__(self, downloader=None):
2878 InfoExtractor.__init__(self, downloader)
2880 def report_download_json(self, file_id):
2881 """Report JSON download."""
2882 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2884 def report_extraction(self, file_id):
2885 """Report information extraction."""
2886 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2888 def get_urls(self, jsonData, fmt, bitrate='best'):
2889 """Get urls from 'audio_formats' section in json"""
# 'best'/unknown bitrate falls back to the highest available one; formats
# without per-bitrate sub-dicts raise TypeError and are used directly.
2892 bitrate_list = jsonData[fmt]
2893 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2894 bitrate = max(bitrate_list) # select highest
2896 url_list = jsonData[fmt][bitrate]
2897 except TypeError: # we have no bitrate info.
2898 url_list = jsonData[fmt]
2901 def check_urls(self, url_list):
2902 """Returns 1st active url from list"""
# Probes each candidate with a real request; the first one that does not
# raise wins (network errors are elided in this view's except clause).
2903 for url in url_list:
2905 compat_urllib_request.urlopen(url)
2907 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2912 def _print_formats(self, formats):
2913 print('Available formats:')
2914 for fmt in formats.keys():
2915 for b in formats[fmt]:
2917 ext = formats[fmt][b][0]
2918 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2919 except TypeError: # we have no bitrate info
2920 ext = formats[fmt][0]
2921 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2924 def _real_extract(self, url):
2925 mobj = re.match(self._VALID_URL, url)
2927 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2929 # extract uploader & filename from url
# .decode('utf-8') is Python-2 str->unicode conversion (see urlparse import).
2930 uploader = mobj.group(1).decode('utf-8')
2931 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2933 # construct API request
# API path reuses the last two URL path components: /api/1/cloudcast/<user>/<slug>.json
2934 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2935 # retrieve .json file with links to files
2936 request = compat_urllib_request.Request(file_url)
2938 self.report_download_json(file_url)
2939 jsonData = compat_urllib_request.urlopen(request).read()
2940 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2941 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
2945 json_data = json.loads(jsonData)
2946 player_url = json_data['player_swf_url']
2947 formats = dict(json_data['audio_formats'])
2949 req_format = self._downloader.params.get('format', None)
2952 if self._downloader.params.get('listformats', None):
2953 self._print_formats(formats)
# No -f given: take the first format whose URL list yields a live URL.
2956 if req_format is None or req_format == 'best':
2957 for format_param in formats.keys():
2958 url_list = self.get_urls(formats, format_param)
2960 file_url = self.check_urls(url_list)
2961 if file_url is not None:
2964 if req_format not in formats:
2965 self._downloader.trouble(u'ERROR: format is not available')
2968 url_list = self.get_urls(formats, req_format)
2969 file_url = self.check_urls(url_list)
2970 format_param = req_format
2973 'id': file_id.decode('utf-8'),
2974 'url': file_url.decode('utf-8'),
2975 'uploader': uploader.decode('utf-8'),
2976 'upload_date': None,
2977 'title': json_data['name'],
2978 'ext': file_url.split('.')[-1].decode('utf-8'),
2979 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2980 'thumbnail': json_data['thumbnail_url'],
2981 'description': json_data['description'],
2982 'player_url': player_url.decode('utf-8'),
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Dispatch on URL shape: a single video, one course page, or the site root."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'type': 'video',
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            try:
                coursepage = compat_urllib_request.urlopen(url).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                # Recurse through the generic dispatcher for each referenced video.
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com."""

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
    IE_NAME = u'Youku'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def _gen_sid(self):
        """Generate a pseudo-random session id: ms timestamp + two random ints."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Derive the character substitution table from the server-supplied seed."""
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            # Linear-congruential step drives the shuffle of the source alphabet.
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode the '*'-separated obfuscated file id using the seeded table."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    # Regexes against the raw page source.
    VIDEO_URL_RE = r'flv_url=(.*?)&'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        # Get webpage content
        try:
            webpage_bytes = compat_urllib_request.urlopen(url).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return

        result = re.search(self.VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group(1))

        result = re.search(self.VIDEO_TITLE_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = result.group(1)

        result = re.search(self.VIDEO_THUMB_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = result.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading extry"""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report downloading extry"""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report downloading extry"""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report downloading extry"""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
            self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
            self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
            self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video page URL')
            return
        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)

        # Extract video links on video page
        """Extract video links of all sizes"""
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.trouble(u'ERROR: unable to extract video links')
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The CDN url can be derived from the page path directly.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
        def _findProp(rexp, default=None):
            # Small page-scraping helper: first group of the match, unescaped.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
            return

        response = json.loads(webpage)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                video_date = re.sub('-', '', clip['created_on'][:10])
                info.append({
                    'id': clip['id'],
                    'url': video_url,
                    'title': clip['title'],
                    'uploader': clip.get('user_id', clip.get('channel_id')),
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            # Channel url: page through the archive.
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/clip/show/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short page means we reached the end of the archive.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            self._downloader.trouble(u'ERROR: unable to find video information')
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
        if not m:
            self._downloader.trouble(u'Cannot find video title')
        title = unescapeHTML(m.group('title'))

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
class TweetReelIE(InfoExtractor):
    """Information extractor for tweetreel.com."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find status ID')
        status_id = m.group(1)

        m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
        if not m:
            # NOTE(review): only a warning here, but m.group(1) below would
            # still fail if the description is truly absent — confirm intent.
            self._downloader.trouble(u'WARNING: Cannot find description')
        desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()

        m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find uploader')
        uploader = unescapeHTML(m.group('uploader'))
        uploader_id = unescapeHTML(m.group('uploader_id'))

        m = re.search(r'<span unixtime="([0-9]+)"', webpage)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find upload date')
        upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')

        # The downloadable file lives at a predictable location keyed by status id.
        video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mov',
            'title': desc,
            'description': desc,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'internal_id': status_id,
            'upload_date': upload_date
        }
        return [info]
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com videos."""

    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose mode, so override the base matcher.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        gameID = m.group('gameID')
        videourl = 'http://store.steampowered.com/video/%s/' % gameID
        webpage = self._download_webpage(videourl, gameID)
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        videos = []
        # Pair each movie entry with the corresponding title span, in page order.
        for vid,vtitle in zip(mweb,titles):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            if not video_url:
                self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
            info = {
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title)
            }
            videos.append(info)
        return videos
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""

    _VALID_URL = r'http://www.ustream.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        # The flv is served from the CDN at a predictable location.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        m = re.search(r'data-title="(?P<title>.+)"',webpage)
        title = m.group('title')
        m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
        uploader = m.group('uploader')
        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader
        }
        return [info]
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
    IE_NAME = u'youporn'
    VIDEO_TITLE_RE = r'videoTitleArea">(?P<title>.*)</h1>'
    VIDEO_DATE_RE = r'Date:</b>(?P<date>.*)</li>'
    VIDEO_UPLOADER_RE = r'Submitted:</b>(?P<uploader>.*)</li>'
    DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
    LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_id(self, video_id):
        """Report finding video ID"""
        self._downloader.to_screen(u'[youporn] Video ID: %s' % video_id)

    def report_webpage(self, url):
        """Report downloading page"""
        self._downloader.to_screen(u'[youporn] Downloaded page: %s' % url)

    def report_title(self, video_title):
        """Report dfinding title"""
        self._downloader.to_screen(u'[youporn] Title: %s' % video_title)

    def report_uploader(self, uploader):
        """Report dfinding title"""
        self._downloader.to_screen(u'[youporn] Uploader: %s' % uploader)

    def report_upload_date(self, video_date):
        """Report finding date"""
        self._downloader.to_screen(u'[youporn] Date: %s' % video_date)

    def _print_formats(self, formats):
        """Print all available formats"""
        print('Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the format dict whose 'format' field equals req_format, else None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid').decode('utf-8')
        self.report_id(video_id)

        # Get webpage content
        try:
            webpage = compat_urllib_request.urlopen(url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return
        self.report_webpage(url)

        # Get the video title
        result = re.search(self.VIDEO_TITLE_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = result.group('title').decode('utf-8').strip()
        self.report_title(video_title)

        # Get the video date
        result = re.search(self.VIDEO_DATE_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video date')
            return
        upload_date = result.group('date').decode('utf-8').strip()
        self.report_upload_date(upload_date)

        # Get the video uploader
        result = re.search(self.VIDEO_UPLOADER_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader')
            return
        video_uploader = result.group('uploader').decode('utf-8').strip()
        video_uploader = clean_html( video_uploader )
        self.report_uploader(video_uploader)

        # Get all of the formats available
        result = re.search(self.DOWNLOAD_LIST_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract download list')
            return
        download_list_html = result.group('download_list').decode('utf-8').strip()

        # Get all of the links from the page
        links = re.findall(self.LINK_RE, download_list_html)
        if(len(links) == 0):
            self._downloader.trouble(u'ERROR: no known formats available for video')
            return

        self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))

        formats = []
        for link in links:
            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link.decode('utf-8') )
            path = urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        #format_limit = self._downloader.params.get('format_limit', None)
        self._downloader.to_screen(u'[youporn] Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # Fixed: previously tested the undefined name `result` here.
            if format is None:
                self._downloader.trouble(u'ERROR: requested format not available')
                return
            return [format]
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
    IE_NAME = u'pornotube'
    VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
    VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading extry"""
        self._downloader.to_screen(u'[pornotube] Downloading entry: %s' % url.decode('utf-8'))

    def report_date(self, upload_date):
        """Report finding uploaded date"""
        self._downloader.to_screen(u'[pornotube] Entry date: %s' % upload_date)

    def report_webpage(self, url):
        """Report downloading page"""
        self._downloader.to_screen(u'[pornotube] Downloaded page: %s' % url)

    def report_title(self, video_title):
        """Report downloading extry"""
        self._downloader.to_screen(u'[pornotube] Title: %s' % video_title.decode('utf-8'))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid').decode('utf-8')
        video_title = mobj.group('title').decode('utf-8')
        self.report_title(video_title);

        # Get webpage content
        try:
            webpage = compat_urllib_request.urlopen(url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return
        self.report_webpage(url)

        # Get the video URL
        result = re.search(self.VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url').decode('utf-8'))
        self.report_extract_entry(video_url)

        #Get the uploaded date
        result = re.search(self.VIDEO_UPLOADED_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        upload_date = result.group('date').decode('utf-8')
        self.report_date(upload_date);

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv',
                'thumbnail': None,
                'description': None,
                'player_url': None}

        return [info]
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/([^.]+).html$'
    IE_NAME = u'youjizz'
    VIDEO_TITLE_RE = r'<title>(?P<title>.*)</title>'
    EMBED_PAGE_RE = r'http://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)'
    SOURCE_RE = r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading extry"""
        self._downloader.to_screen(u'[youjizz] Downloading entry: %s' % url.decode('utf-8'))

    def report_webpage(self, url):
        """Report downloading page"""
        self._downloader.to_screen(u'[youjizz] Downloaded page: %s' % url)

    def report_title(self, video_title):
        """Report downloading extry"""
        self._downloader.to_screen(u'[youjizz] Title: %s' % video_title.decode('utf-8'))

    def report_embed_page(self, embed_page):
        """Report downloading extry"""
        self._downloader.to_screen(u'[youjizz] Embed Page: %s' % embed_page.decode('utf-8'))

    def _real_extract(self, url):
        # Get webpage content
        try:
            webpage = compat_urllib_request.urlopen(url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return
        self.report_webpage(url)

        # Get the video title
        result = re.search(self.VIDEO_TITLE_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = result.group('title').decode('utf-8').strip()
        self.report_title(video_title)

        # Get the embed page
        result = re.search(self.EMBED_PAGE_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract embed page')
            return

        embed_page_url = result.group(0).decode('utf-8').strip()
        video_id = result.group('videoid').decode('utf-8')
        self.report_embed_page(embed_page_url)

        try:
            webpage = compat_urllib_request.urlopen(embed_page_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video embed page: %s' % err)
            return

        # Get the video URL
        result = re.search(self.SOURCE_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = result.group('source').decode('utf-8')
        self.report_extract_entry(video_url)

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv',
                'thumbnail': None,
                'description': None,
                'player_url': embed_page_url}

        return [info]
4063 def gen_extractors():
4064 """ Return a list of an instance of every supported extractor.
4065 The order does matter; the first extractor matched is the one handling the URL.
4068 YoutubePlaylistIE(),
4092 StanfordOpenClassroomIE(),