2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
14 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title:          Video title, unescaped.
    ext:            Video filename extension.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader_id:    Nickname or id of the video uploader.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # Store the FileDownloader through the public setter so that
        # self._downloader is always defined, even when downloader is None.
        self.set_downloader(downloader)
74 def suitable(self, url):
75 """Receives a URL and returns True if suitable for this IE."""
76 return re.match(self._VALID_URL, url) is not None
        """Getter method for _WORKING."""
        # NOTE(review): the enclosing `def` line for this docstring (the
        # _WORKING getter) appears to be elided from this chunk — confirm
        # against the full file.

        """Initializes an instance (authentication, etc)."""
        # NOTE(review): the enclosing `def initialize(self):` line (and any
        # "already initialized" guard) appears elided here as well.
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # NOTE(review): a preceding `self.initialize()` call appears elided
        # before this delegation — confirm against the full file.
        return self._real_extract(url)
    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        # The downloader is the FileDownloader instance that consumes the
        # info dictionaries produced by _real_extract().
        self._downloader = downloader
    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""

        # NOTE(review): the `@property` / `def IE_NAME(self):` header for the
        # line below appears elided; it derives the IE name by stripping the
        # trailing "IE" from the class name (e.g. YoutubeIE -> "Youtube").
        return type(self).__name__[:-2]
    def _download_webpage(self, url, video_id, note=None, errnote=None):
        """Return the webpage at *url* decoded as UTF-8 (errors replaced).

        note/errnote are the progress/failure messages shown to the user;
        a default is substituted when they are None.
        Raises ExtractorError when the download fails.
        """
        # NOTE(review): the `if note is None:` guard and the `try:` opener
        # around urlopen appear elided from this chunk — confirm against the
        # full file.
        note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        urlh = compat_urllib_request.urlopen(url)
        webpage_bytes = urlh.read()
        return webpage_bytes.decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)))
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # NOTE(review): the `_VALID_URL = r'''...'''` opener for the verbose
    # pattern below appears elided from this chunk — the lines that follow
    # are the body of that re.VERBOSE pattern.
        (?:https?://)?                                       # http(s):// (optional)
        (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
           tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
        (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
        (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
        (?:                                                  # the various things that can precede the ID:
            (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
            |(?:                                             # or the v= param in all its forms
                (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
            )?                                               # optional -> youtube.com/xxxx is OK
        )?                                                   # all until now is optional -> you can pass the naked ID
        ([0-9A-Za-z_-]+)                                     # here is it! the YouTube video ID
        (?(1).+)?                                            # if we found the ID, everything can follow
    # URL used to force the English interface before scraping.
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; NOTE(review): most entries of the two
    # dicts below appear elided from this chunk.
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    _video_dimensions = {
184 def suitable(self, url):
185 """Receives a URL and returns True if suitable for this IE."""
186 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
188 def report_lang(self):
189 """Report attempt to set language."""
190 self._downloader.to_screen(u'[youtube] Setting language')
192 def report_login(self):
193 """Report attempt to log in."""
194 self._downloader.to_screen(u'[youtube] Logging in')
196 def report_age_confirmation(self):
197 """Report attempt to confirm age."""
198 self._downloader.to_screen(u'[youtube] Confirming age')
200 def report_video_webpage_download(self, video_id):
201 """Report attempt to download video webpage."""
202 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
204 def report_video_info_webpage_download(self, video_id):
205 """Report attempt to download video info webpage."""
206 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        # (Docstring fixed: it was copy-pasted from the info-webpage method;
        # this helper announces the subtitles download, as the message shows.)
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
212 def report_information_extraction(self, video_id):
213 """Report attempt to extract video information."""
214 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        # (Docstring fixed: it previously said "Report extracted video URL",
        # a copy-paste error; the message below reports an unavailable format.)
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
220 def report_rtmp_download(self):
221 """Indicate the download will use the RTMP protocol."""
222 self._downloader.to_screen(u'[youtube] RTMP download detected')
    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timedtext XML into SubRip (.srt) text.

        NOTE(review): this chunk appears to be missing the `srt = ''`
        initializer, the `start = float(start)` conversion (without which
        `start + float(dur)` below would concatenate str+float and fail),
        and the final `return srt` — confirm against the full file.
        """
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            # Captions without an explicit duration default to 4 seconds.
            if not dur: dur = '4'
            end = start + float(dur)
            # Render seconds as HH:MM:SS,mmm SubRip timestamps.
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
    def _extract_subtitles(self, video_id):
        """Fetch closed captions for *video_id* and return (error, srt).

        Returns a (warning_message, None) pair on failure, or
        (None, srt_contents) on success.
        NOTE(review): the `try:` openers around the two urlopen calls appear
        elided from this chunk — confirm against the full file.
        """
        self.report_video_subtitles_download(video_id)
        # First fetch the list of available caption tracks.
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
            srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        # Map language code -> track name.
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        # Pick the user-requested language, then English, then any.
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            srt_lang = list(srt_lang_list.keys())[0]
        if not srt_lang in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
            srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
            return (u'WARNING: unable to download video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))
    def _print_formats(self, formats):
        """Print the available itags with extension and dimensions."""
        print('Available formats:')
        # NOTE(review): the `for x in formats:` loop header appears elided
        # from this chunk — confirm against the full file.
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
    def _real_initialize(self):
        """Set site language, optionally log in, and confirm the age gate.

        Credentials come from --username/--password or, with --netrc, from
        the user's .netrc entry for the 'youtube' machine.
        NOTE(review): several connective lines (`return` after the None
        check, `try:` openers, the `login_form = {` / `age_form = {` dict
        openers) appear elided from this chunk — confirm against the full
        file before editing logic here.
        """
        if self._downloader is None:
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
        # Force the English interface so later regexes match.
        request = compat_urllib_request.Request(self._LANG_URL)
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
        # No authentication to be performed
                'current_form': 'loginForm',
                'action_login': 'Log In',
                'username': username,
                'password': password,
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # A login form in the response means the credentials were rejected.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
                'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
    def _extract_id(self, url):
        """Return the 11-character video id parsed out of *url*.

        NOTE(review): the `if mobj is None:` guard and the trailing
        `return video_id` appear elided from this chunk.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        # Group 2 holds the id; group 1 is the optional preceding path part.
        video_id = mobj.group(2)
    def _real_extract(self, url):
        """Extract metadata and direct media URLs for a YouTube watch page.

        Returns a list of info dictionaries, one per selected format.
        NOTE(review): many connective lines (`try:` openers, `if mobj is
        not None:` guards, `return` statements, the `results = []` /
        `results.append({` bookkeeping) appear elided from this chunk —
        confirm every indented region against the full file before editing.
        """
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Fetch the watch page (with has_verified to bypass some gates).
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
            # Unescape the backslash-escaped URL found in the page JS.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Try several el= variants until one yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/user/([^"]+)">', video_webpage)
            video_uploader_id = mobj.group(1)
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # Upload date: scraped from the page and normalized to YYYYMMDD.
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
            video_description = ''

        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
                self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # itag -> signed media URL.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                        video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

        for format_param, video_real_url in video_url_list:
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    # Endpoints used by _real_initialize() to disable the family filter.
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'
    def __init__(self, downloader=None):
        # Delegate to the base class, which stores the downloader reference.
        InfoExtractor.__init__(self, downloader)
553 def report_disclaimer(self):
554 """Report disclaimer retrieval."""
555 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
557 def report_age_confirmation(self):
558 """Report attempt to confirm age."""
559 self._downloader.to_screen(u'[metacafe] Confirming age')
561 def report_download_webpage(self, video_id):
562 """Report webpage download."""
563 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
565 def report_extraction(self, video_id):
566 """Report information extraction."""
567 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
    def _real_initialize(self):
        """Fetch the disclaimer page and POST past the family filter.

        NOTE(review): the `try:` openers and the `disclaimer_form = {` dict
        opener appear elided from this chunk — confirm against the full file.
        """
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
            'submit': "Continue - I'm over 18",
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
    def _real_extract(self, url):
        """Extract the direct media URL and metadata from a Metacafe page.

        yt- prefixed ids are delegated to the YouTube extractor.
        NOTE(review): `if mobj is None:` guards, `try:` openers, `return`
        statements and the `return [{` opener appear elided from this
        chunk — confirm each indented region against the full file.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            gdaKey = mobj.group(1)
            video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
            # Fallback: parse the flashvars blob for the media URL and key.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # Accepts any 2-3 letter country TLD (dailymotion.com, .fr, ...).
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
    def __init__(self, downloader=None):
        # Delegate to the base class, which stores the downloader reference.
        InfoExtractor.__init__(self, downloader)
678 def report_download_webpage(self, video_id):
679 """Report webpage download."""
680 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
682 def report_extraction(self, video_id):
683 """Report information extraction."""
684 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
    def _real_extract(self, url):
        """Extract the best-quality media URL and metadata from Dailymotion.

        NOTE(review): `if mobj is None:` guards, the `if key in flashvars:` /
        `max_quality = key` / `break` lines inside the quality loop, and the
        `return [{` opener appear elided from this chunk — confirm against
        the full file.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        # Strip title suffix and query string from the id.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter so restricted videos resolve.
        request.add_header('Cookie', 'family_filter=off')
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Probe qualities from best to worst; first hit wins.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
            self._downloader.trouble(u'ERROR: unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
            self._downloader.trouble(u'ERROR: unable to extract video URL')

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
            # lookin for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
                video_uploader = mobj_official.group(1)
            video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
            # Page shows DD-MM-YYYY; reorder to YYYYMMDD.
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # Only FLV media referenced via the ?current= query parameter.
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'
    def __init__(self, downloader=None):
        # Delegate to the base class, which stores the downloader reference.
        InfoExtractor.__init__(self, downloader)
776 def report_download_webpage(self, video_id):
777 """Report webpage download."""
778 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
780 def report_extraction(self, video_id):
781 """Report information extraction."""
782 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
    def _real_extract(self, url):
        """Extract the FLV media URL and metadata from a Photobucket page.

        NOTE(review): `if mobj is None:` guards, `try:` openers, the
        `video_url = mediaURL` assignment and the `return [{` opener appear
        elided from this chunk — confirm against the full file.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        # Title and uploader come from the same <title> pattern.
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'
    def __init__(self, downloader=None):
        # Delegate to the base class, which stores the downloader reference.
        InfoExtractor.__init__(self, downloader)
845 def report_download_webpage(self, video_id):
846 """Report webpage download."""
847 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
849 def report_extraction(self, video_id):
850 """Report information extraction."""
851 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
    def _real_extract(self, url, new_video=True):
        """Extract the media URL and metadata from a Yahoo! Video page.

        Non-/watch/ URLs are first rewritten to the canonical /watch/ form
        and re-extracted with new_video=False.
        NOTE(review): `if mobj is None:` guards, `try:` openers, `return`
        statements and the `return [{` opener appear elided from this
        chunk — confirm against the full file.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
                self._downloader.trouble(u'ERROR: Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
        # NOTE(review): group(1) is the (people|profile) alternation, not the
        # uploader name — this looks like it should be group(2); confirm
        # against the full file before changing.
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

            'id':       video_id.decode('utf-8'),
            'uploader': video_uploader,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
975 class VimeoIE(InfoExtractor):
976 """Information extractor for vimeo.com."""
# NOTE(review): this chunk is a sampled view of the file — guard lines such as
# `try:` / `if mobj is None:` / `return` are elided between the numbered lines,
# so the code below is annotated in place rather than restructured.
978 # _VALID_URL matches Vimeo URLs
979 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
982 def __init__(self, downloader=None):
983 InfoExtractor.__init__(self, downloader)
985 def report_download_webpage(self, video_id):
986 """Report webpage download."""
987 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
989 def report_extraction(self, video_id):
990 """Report information extraction."""
991 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
993 def _real_extract(self, url, new_video=True):
# The numeric video id is the first capture group of _VALID_URL.
994 # Extract ID from URL
995 mobj = re.match(self._VALID_URL, url)
997 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1000 video_id = mobj.group(1)
1002 # Retrieve video webpage to extract further information
1003 request = compat_urllib_request.Request(url, None, std_headers)
1005 self.report_download_webpage(video_id)
1006 webpage_bytes = compat_urllib_request.urlopen(request).read()
1007 webpage = webpage_bytes.decode('utf-8')
1008 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1009 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1012 # Now we begin extracting as much information as we can from what we
1013 # retrieved. First we extract the information common to all extractors,
1014 # and latter we extract those that are Vimeo specific.
1015 self.report_extraction(video_id)
1017 # Extract the config JSON
# The player config is embedded inline as ` = {config:...,assets:...}`;
# slice the JSON object out between those two markers, then parse it.
1019 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1020 config = json.loads(config)
1022 self._downloader.trouble(u'ERROR: unable to extract info section')
1026 video_title = config["video"]["title"]
1028 # Extract uploader and uploader_id
# uploader_id is the last path component of the owner's profile URL.
1029 video_uploader = config["video"]["owner"]["name"]
1030 video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]
1032 # Extract video thumbnail
1033 video_thumbnail = config["video"]["thumbnail"]
1035 # Extract video description
1036 video_description = get_element_by_attribute("itemprop", "description", webpage)
1037 if video_description: video_description = clean_html(video_description)
1038 else: video_description = ''
1040 # Extract upload date
# Fold the ISO date from <meta itemprop="dateCreated"> into YYYYMMDD.
1041 video_upload_date = None
1042 mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1043 if mobj is not None:
1044 video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1046 # Vimeo specific: extract request signature and timestamp
1047 sig = config['request']['signature']
1048 timestamp = config['request']['timestamp']
1050 # Vimeo specific: extract video codec and quality information
1051 # First consider quality, then codecs, then take everything
1052 # TODO bind to format param
1053 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1054 files = { 'hd': [], 'sd': [], 'other': []}
# Bucket every available (codec, extension, quality) triple by quality; the
# codecs list above is in preference order, so the first entry of the best
# non-empty bucket wins in the loop below.
1055 for codec_name, codec_extension in codecs:
1056 if codec_name in config["video"]["files"]:
1057 if 'hd' in config["video"]["files"][codec_name]:
1058 files['hd'].append((codec_name, codec_extension, 'hd'))
1059 elif 'sd' in config["video"]["files"][codec_name]:
1060 files['sd'].append((codec_name, codec_extension, 'sd'))
1062 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
1064 for quality in ('hd', 'sd', 'other'):
1065 if len(files[quality]) > 0:
1066 video_quality = files[quality][0][2]
1067 video_codec = files[quality][0][0]
1068 video_extension = files[quality][0][1]
1069 self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1072 self._downloader.trouble(u'ERROR: no known codec found')
# Build the play_redirect URL from the signature/timestamp extracted above.
1075 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1076 %(video_id, sig, timestamp, video_quality, video_codec.upper())
1081 'uploader': video_uploader,
1082 'uploader_id': video_uploader_id,
1083 'upload_date': video_upload_date,
1084 'title': video_title,
1085 'ext': video_extension,
1086 'thumbnail': video_thumbnail,
1087 'description': video_description,
1091 class ArteTvIE(InfoExtractor):
1092 """arte.tv information extractor."""
# NOTE(review): sampled view — error-handling/return lines are elided between
# the numbered lines; comments added in place only.
1094 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
# Live-stream pages are identified by an index-<n>.html suffix on the id.
1095 _LIVE_URL = r'index-[0-9]+\.html$'
1097 IE_NAME = u'arte.tv'
1099 def __init__(self, downloader=None):
1100 InfoExtractor.__init__(self, downloader)
1102 def report_download_webpage(self, video_id):
1103 """Report webpage download."""
1104 self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)
1106 def report_extraction(self, video_id):
1107 """Report information extraction."""
1108 self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)
1110 def fetch_webpage(self, url):
# Download `url` and return its raw body; network/URL errors are routed
# through self._downloader.trouble().
1111 request = compat_urllib_request.Request(url)
1113 self.report_download_webpage(url)
1114 webpage = compat_urllib_request.urlopen(request).read()
1115 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1116 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1118 except ValueError as err:
1119 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1123 def grep_webpage(self, url, regex, regexFlags, matchTuples):
# Fetch `url`, apply `regex`, and collect the requested capture groups into
# an info dict. Each matchTuple is (group_index, dict_key, error_message);
# a missing group reports its error_message via trouble().
1124 page = self.fetch_webpage(url)
1125 mobj = re.search(regex, page, regexFlags)
1129 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1132 for (i, key, err) in matchTuples:
1133 if mobj.group(i) is None:
1134 self._downloader.trouble(err)
1137 info[key] = mobj.group(i)
1141 def extractLiveStream(self, url):
# Language code is the 4th-from-last path component of the live URL.
1142 video_lang = url.split('/')[-4]
1143 info = self.grep_webpage(
1145 r'src="(.*?/videothek_js.*?\.js)',
1148 (1, 'url', u'ERROR: Invalid URL: %s' % url)
# Follow the videothek JS file to locate the geo-gated stream path, the
# SWF player and the stream URL.
1151 http_host = url.split('/')[2]
1152 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1153 info = self.grep_webpage(
1155 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1156 '(http://.*?\.swf).*?' +
1160 (1, 'path', u'ERROR: could not extract video path: %s' % url),
1161 (2, 'player', u'ERROR: could not extract video player: %s' % url),
1162 (3, 'url', u'ERROR: could not extract video url: %s' % url)
1165 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1167 def extractPlus7Stream(self, url):
# "Plus 7" (catch-up) pages: chase videorefFileUrl -> per-language <video>
# ref -> final XML with id/name/date and the HD url.
1168 video_lang = url.split('/')[-3]
1169 info = self.grep_webpage(
1171 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1174 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1177 next_url = compat_urllib_parse.unquote(info.get('url'))
1178 info = self.grep_webpage(
1180 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1183 (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
1186 next_url = compat_urllib_parse.unquote(info.get('url'))
1188 info = self.grep_webpage(
1190 r'<video id="(.*?)".*?>.*?' +
1191 '<name>(.*?)</name>.*?' +
1192 '<dateVideo>(.*?)</dateVideo>.*?' +
1193 '<url quality="hd">(.*?)</url>',
1196 (1, 'id', u'ERROR: could not extract video id: %s' % url),
1197 (2, 'title', u'ERROR: could not extract video title: %s' % url),
1198 (3, 'date', u'ERROR: could not extract video date: %s' % url),
1199 (4, 'url', u'ERROR: could not extract video url: %s' % url)
1204 'id': info.get('id'),
1205 'url': compat_urllib_parse.unquote(info.get('url')),
1206 'uploader': u'arte.tv',
1207 'upload_date': info.get('date'),
1208 'title': info.get('title').decode('utf-8'),
1214 def _real_extract(self, url):
1215 video_id = url.split('/')[-1]
1216 self.report_extraction(video_id)
# Dispatch on URL shape: live stream vs. regular "Plus 7" page.
1218 if re.search(self._LIVE_URL, video_id) is not None:
1219 self.extractLiveStream(url)
1222 info = self.extractPlus7Stream(url)
1227 class GenericIE(InfoExtractor):
1228 """Generic last-resort information extractor."""
# NOTE(review): sampled view — `try:` / `if mobj is None:` / `return` lines
# are elided between the numbered lines; comments added in place only.
1231 IE_NAME = u'generic'
1233 def __init__(self, downloader=None):
1234 InfoExtractor.__init__(self, downloader)
1236 def report_download_webpage(self, video_id):
1237 """Report webpage download."""
1238 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1239 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1241 def report_extraction(self, video_id):
1242 """Report information extraction."""
1243 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1245 def report_following_redirect(self, new_url):
1246 """Report information extraction."""
1247 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1249 def _test_redirect(self, url):
1250 """Check if it is a redirect, like url shorteners, in case restart chain."""
# Uses a HEAD request so shortener targets are resolved without downloading
# the body; the two handler subclasses below keep the request a HEAD across
# redirects and fall back to GET on 405.
1251 class HeadRequest(compat_urllib_request.Request):
1252 def get_method(self):
1255 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1257 Subclass the HTTPRedirectHandler to make it use our
1258 HeadRequest also on the redirected URL
1260 def redirect_request(self, req, fp, code, msg, headers, newurl):
1261 if code in (301, 302, 303, 307):
1262 newurl = newurl.replace(' ', '%20')
# Drop body-specific headers before re-issuing as HEAD.
1263 newheaders = dict((k,v) for k,v in req.headers.items()
1264 if k.lower() not in ("content-length", "content-type"))
1265 return HeadRequest(newurl,
1267 origin_req_host=req.get_origin_req_host(),
1270 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1272 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1274 Fallback to GET if HEAD is not allowed (405 HTTP error)
1276 def http_error_405(self, req, fp, code, msg, headers):
1280 newheaders = dict((k,v) for k,v in req.headers.items()
1281 if k.lower() not in ("content-length", "content-type"))
1282 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1284 origin_req_host=req.get_origin_req_host(),
# Build a bare opener with only the handlers we need (no cookies etc.).
1288 opener = compat_urllib_request.OpenerDirector()
1289 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1290 HTTPMethodFallback, HEADRedirectHandler,
1291 compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1292 opener.add_handler(handler())
1294 response = opener.open(HeadRequest(url))
1295 new_url = response.geturl()
# If the final URL differs, restart the whole extraction chain on it.
1300 self.report_following_redirect(new_url)
1301 self._downloader.download([new_url])
1304 def _real_extract(self, url):
1305 if self._test_redirect(url): return
1307 video_id = url.split('/')[-1]
1308 request = compat_urllib_request.Request(url)
1310 self.report_download_webpage(video_id)
1311 webpage = compat_urllib_request.urlopen(request).read()
1312 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1313 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1315 except ValueError as err:
1316 # since this is the last-resort InfoExtractor, if
1317 # this error is thrown, it'll be thrown here
1318 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1321 self.report_extraction(video_id)
1322 # Start with something easy: JW Player in SWFObject
1323 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1325 # Broaden the search a little bit
1326 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1328 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1331 # It's possible that one of the regexes
1332 # matched, but returned an empty group:
1333 if mobj.group(1) is None:
1334 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1337 video_url = compat_urllib_parse.unquote(mobj.group(1))
1338 video_id = os.path.basename(video_url)
1340 # here's a fun little line of code for you:
# The id is the media filename without its extension; the extension is kept
# separately for the output template.
1341 video_extension = os.path.splitext(video_id)[1][1:]
1342 video_id = os.path.splitext(video_id)[0]
1344 # it's tempting to parse this further, but you would
1345 # have to take into account all the variations like
1346 # Video Title - Site Name
1347 # Site Name | Video Title
1348 # Video Title - Tagline | Site Name
1349 # and so on and so forth; it's just not practical
1350 mobj = re.search(r'<title>(.*)</title>', webpage)
1352 self._downloader.trouble(u'ERROR: unable to extract title')
1354 video_title = mobj.group(1)
1356 # video uploader is domain name
1357 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1359 self._downloader.trouble(u'ERROR: unable to extract title')
1361 video_uploader = mobj.group(1)
1366 'uploader': video_uploader,
1367 'upload_date': None,
1368 'title': video_title,
1369 'ext': video_extension,
1373 class YoutubeSearchIE(InfoExtractor):
1374 """Information Extractor for YouTube search queries."""
# NOTE(review): sampled view — control-flow lines are elided between the
# numbered lines; comments added in place only.
# Queries look like "ytsearch:foo" (1 result), "ytsearchN:foo" (N results)
# or "ytsearchall:foo" (up to _max_youtube_results).
1375 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1376 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1377 _max_youtube_results = 1000
1378 IE_NAME = u'youtube:search'
1380 def __init__(self, downloader=None):
1381 InfoExtractor.__init__(self, downloader)
1383 def report_download_page(self, query, pagenum):
1384 """Report attempt to download search page with given number."""
1385 query = query.decode(preferredencoding())
1386 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1388 def _real_extract(self, query):
1389 mobj = re.match(self._VALID_URL, query)
1391 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# Split "ytsearchN" prefix from the actual query text.
1394 prefix, query = query.split(':')
1396 query = query.encode('utf-8')
1398 self._download_n_results(query, 1)
1400 elif prefix == 'all':
1401 self._download_n_results(query, self._max_youtube_results)
1407 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1409 elif n > self._max_youtube_results:
1410 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1411 n = self._max_youtube_results
1412 self._download_n_results(query, n)
1414 except ValueError: # parsing prefix as integer fails
1415 self._download_n_results(query, 1)
1418 def _download_n_results(self, query, n):
1419 """Downloads a specified number of results for a query"""
# Page through the GData API (50 ids per page) until `limit` is reached,
# then hand each watch URL back to the downloader.
1425 while (50 * pagenum) < limit:
1426 self.report_download_page(query, pagenum+1)
1427 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1428 request = compat_urllib_request.Request(result_url)
1430 data = compat_urllib_request.urlopen(request).read()
1431 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1432 self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
1434 api_response = json.loads(data)['data']
1436 new_ids = list(video['id'] for video in api_response['items'])
1437 video_ids += new_ids
# Never request more than the API reports as available.
1439 limit = min(n, api_response['totalItems'])
1442 if len(video_ids) > n:
1443 video_ids = video_ids[:n]
1444 for id in video_ids:
1445 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1449 class GoogleSearchIE(InfoExtractor):
1450 """Information Extractor for Google Video search queries."""
# NOTE(review): sampled view — control-flow lines are elided between the
# numbered lines; comments added in place only.
# Queries: "gvsearch:foo", "gvsearchN:foo", "gvsearchall:foo".
1451 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1452 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1453 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1454 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1455 _max_google_results = 1000
1456 IE_NAME = u'video.google:search'
1458 def __init__(self, downloader=None):
1459 InfoExtractor.__init__(self, downloader)
1461 def report_download_page(self, query, pagenum):
1462 """Report attempt to download playlist page with given number."""
1463 query = query.decode(preferredencoding())
1464 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1466 def _real_extract(self, query):
1467 mobj = re.match(self._VALID_URL, query)
1469 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# Split the "gvsearchN" prefix off the query text, as in YoutubeSearchIE.
1472 prefix, query = query.split(':')
1474 query = query.encode('utf-8')
1476 self._download_n_results(query, 1)
1478 elif prefix == 'all':
1479 self._download_n_results(query, self._max_google_results)
1485 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1487 elif n > self._max_google_results:
1488 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1489 n = self._max_google_results
1490 self._download_n_results(query, n)
1492 except ValueError: # parsing prefix as integer fails
1493 self._download_n_results(query, 1)
1496 def _download_n_results(self, query, n):
1497 """Downloads a specified number of results for a query"""
# Scrape result pages (10 per page via start=pagenum*10) until either n ids
# are collected or the "next page" marker disappears.
1503 self.report_download_page(query, pagenum)
1504 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
1505 request = compat_urllib_request.Request(result_url)
1507 page = compat_urllib_request.urlopen(request).read()
1508 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1509 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1512 # Extract video identifiers
1513 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1514 video_id = mobj.group(1)
1515 if video_id not in video_ids:
1516 video_ids.append(video_id)
1517 if len(video_ids) == n:
1518 # Specified n videos reached
1519 for id in video_ids:
1520 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1523 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1524 for id in video_ids:
1525 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1528 pagenum = pagenum + 1
1531 class YahooSearchIE(InfoExtractor):
1532 """Information Extractor for Yahoo! Video search queries."""
# NOTE(review): sampled view — control-flow lines are elided between the
# numbered lines; comments added in place only.
# Queries: "yvsearch:foo", "yvsearchN:foo", "yvsearchall:foo".
1535 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1536 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1537 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1538 _MORE_PAGES_INDICATOR = r'\s*Next'
1539 _max_yahoo_results = 1000
1540 IE_NAME = u'video.yahoo:search'
1542 def __init__(self, downloader=None):
1543 InfoExtractor.__init__(self, downloader)
1545 def report_download_page(self, query, pagenum):
1546 """Report attempt to download playlist page with given number."""
1547 query = query.decode(preferredencoding())
1548 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1550 def _real_extract(self, query):
1551 mobj = re.match(self._VALID_URL, query)
1553 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# Same prefix-dispatch scheme as the other search IEs.
1556 prefix, query = query.split(':')
1558 query = query.encode('utf-8')
1560 self._download_n_results(query, 1)
1562 elif prefix == 'all':
1563 self._download_n_results(query, self._max_yahoo_results)
1569 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1571 elif n > self._max_yahoo_results:
1572 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1573 n = self._max_yahoo_results
1574 self._download_n_results(query, n)
1576 except ValueError: # parsing prefix as integer fails
1577 self._download_n_results(query, 1)
1580 def _download_n_results(self, query, n):
1581 """Downloads a specified number of results for a query"""
# `already_seen` deduplicates ids across result pages (here the list is
# order-preserving and the set is the O(1) membership check).
1584 already_seen = set()
1588 self.report_download_page(query, pagenum)
1589 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
1590 request = compat_urllib_request.Request(result_url)
1592 page = compat_urllib_request.urlopen(request).read()
1593 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1594 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1597 # Extract video identifiers
1598 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1599 video_id = mobj.group(1)
1600 if video_id not in already_seen:
1601 video_ids.append(video_id)
1602 already_seen.add(video_id)
1603 if len(video_ids) == n:
1604 # Specified n videos reached
1605 for id in video_ids:
1606 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1609 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1610 for id in video_ids:
1611 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1614 pagenum = pagenum + 1
1617 class YoutubePlaylistIE(InfoExtractor):
1618 """Information Extractor for YouTube playlists."""
# NOTE(review): sampled view — control-flow lines are elided between the
# numbered lines; comments added in place only.
# Group 1: playlist-type letter (p/a/list), group 2: playlist id,
# group 3: an optional trailing video id.
1620 _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
1621 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1622 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s'
1623 _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
1624 IE_NAME = u'youtube:playlist'
1626 def __init__(self, downloader=None):
1627 InfoExtractor.__init__(self, downloader)
1629 def report_download_page(self, playlist_id, pagenum):
1630 """Report attempt to download playlist page with given number."""
1631 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1633 def _real_extract(self, url):
1634 # Extract playlist id
1635 mobj = re.match(self._VALID_URL, url)
1637 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# If a single video id trails the playlist URL, download just that video.
1641 if mobj.group(3) is not None:
1642 self._downloader.download([mobj.group(3)])
1645 # Download playlist pages
1646 # prefix is 'p' as default for playlists but there are other types that need extra care
1647 playlist_prefix = mobj.group(1)
1648 if playlist_prefix == 'a':
1649 playlist_access = 'artist'
1651 playlist_prefix = 'p'
1652 playlist_access = 'view_play_list'
1653 playlist_id = mobj.group(2)
# Page through the playlist until the "Next »" marker disappears.
1658 self.report_download_page(playlist_id, pagenum)
1659 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1660 request = compat_urllib_request.Request(url)
1662 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1663 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1664 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1667 # Extract video identifiers
1669 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1670 if mobj.group(1) not in ids_in_page:
1671 ids_in_page.append(mobj.group(1))
1672 video_ids.extend(ids_in_page)
1674 if self._MORE_PAGES_INDICATOR not in page:
1676 pagenum = pagenum + 1
# Apply --playlist-start / --playlist-end slicing before downloading.
1678 total = len(video_ids)
1680 playliststart = self._downloader.params.get('playliststart', 1) - 1
1681 playlistend = self._downloader.params.get('playlistend', -1)
1682 if playlistend == -1:
1683 video_ids = video_ids[playliststart:]
1685 video_ids = video_ids[playliststart:playlistend]
1687 if len(video_ids) == total:
1688 self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
1690 self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))
1692 for id in video_ids:
1693 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1697 class YoutubeChannelIE(InfoExtractor):
1698 """Information Extractor for YouTube channels."""
# NOTE(review): sampled view — control-flow lines are elided between the
# numbered lines; comments added in place only.
1700 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1701 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1702 _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
1703 IE_NAME = u'youtube:channel'
1705 def report_download_page(self, channel_id, pagenum):
1706 """Report attempt to download channel page with given number."""
1707 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1709 def _real_extract(self, url):
1710 # Extract channel id
1711 mobj = re.match(self._VALID_URL, url)
1713 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1716 # Download channel pages
1717 channel_id = mobj.group(1)
# Page through the channel's video list until the "Next »" marker is gone.
1722 self.report_download_page(channel_id, pagenum)
1723 url = self._TEMPLATE_URL % (channel_id, pagenum)
1724 request = compat_urllib_request.Request(url)
1726 page = compat_urllib_request.urlopen(request).read().decode('utf8')
1727 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1728 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1731 # Extract video identifiers
1733 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1734 if mobj.group(1) not in ids_in_page:
1735 ids_in_page.append(mobj.group(1))
1736 video_ids.extend(ids_in_page)
1738 if self._MORE_PAGES_INDICATOR not in page:
1740 pagenum = pagenum + 1
1742 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1744 for id in video_ids:
1745 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1749 class YoutubeUserIE(InfoExtractor):
1750 """Information Extractor for YouTube users."""
# NOTE(review): sampled view — control-flow lines are elided between the
# numbered lines; comments added in place only.
1752 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1753 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1754 _GDATA_PAGE_SIZE = 50
1755 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1756 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1757 IE_NAME = u'youtube:user'
1759 def __init__(self, downloader=None):
1760 InfoExtractor.__init__(self, downloader)
1762 def report_download_page(self, username, start_index):
1763 """Report attempt to download user page."""
1764 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1765 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1767 def _real_extract(self, url):
1769 mobj = re.match(self._VALID_URL, url)
1771 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1774 username = mobj.group(1)
1776 # Download video ids using YouTube Data API. Result size per
1777 # query is limited (currently to 50 videos) so we need to query
1778 # page by page until there are no video ids - it means we got
# GData uses 1-based start indices, hence the +1 below.
1785 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1786 self.report_download_page(username, start_index)
1788 request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1791 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1792 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1793 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1796 # Extract video identifiers
1799 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1800 if mobj.group(1) not in ids_in_page:
1801 ids_in_page.append(mobj.group(1))
1803 video_ids.extend(ids_in_page)
1805 # A little optimization - if current page is not
1806 # "full", ie. does not contain PAGE_SIZE video ids then
1807 # we can assume that this page is the last one - there
1808 # are no more ids on further pages - no need to query
1811 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
# Apply --playlist-start / --playlist-end slicing before downloading.
1816 all_ids_count = len(video_ids)
1817 playliststart = self._downloader.params.get('playliststart', 1) - 1
1818 playlistend = self._downloader.params.get('playlistend', -1)
1820 if playlistend == -1:
1821 video_ids = video_ids[playliststart:]
1823 video_ids = video_ids[playliststart:playlistend]
1825 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1826 (username, all_ids_count, len(video_ids)))
1828 for video_id in video_ids:
1829 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1832 class BlipTVUserIE(InfoExtractor):
1833 """Information Extractor for blip.tv users."""
# NOTE(review): sampled view — control-flow lines are elided between the
# numbered lines; comments added in place only. `self._PAGE_SIZE` is read
# below but not defined in this view — presumably a class attribute elided
# here; verify in the full file.
1835 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1837 IE_NAME = u'blip.tv:user'
1839 def __init__(self, downloader=None):
1840 InfoExtractor.__init__(self, downloader)
1842 def report_download_page(self, username, pagenum):
1843 """Report attempt to download user page."""
1844 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1845 (self.IE_NAME, username, pagenum))
1847 def _real_extract(self, url):
1849 mobj = re.match(self._VALID_URL, url)
1851 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1854 username = mobj.group(1)
# The numeric users_id needed by the AJAX endpoint is scraped from the
# user's page (data-users-id attribute) before paging through episodes.
1856 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1858 request = compat_urllib_request.Request(url)
1861 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1862 mobj = re.search(r'data-users-id="([^"]+)"', page)
1863 page_base = page_base % mobj.group(1)
1864 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1865 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1869 # Download video ids using BlipTV Ajax calls. Result size per
1870 # query is limited (currently to 12 videos) so we need to query
1871 # page by page until there are no video ids - it means we got
1878 self.report_download_page(username, pagenum)
1880 request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )
1883 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1884 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1885 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1888 # Extract video identifiers
1891 for mobj in re.finditer(r'href="/([^"]+)"', page):
1892 if mobj.group(1) not in ids_in_page:
1893 ids_in_page.append(unescapeHTML(mobj.group(1)))
1895 video_ids.extend(ids_in_page)
1897 # A little optimization - if current page is not
1898 # "full", ie. does not contain PAGE_SIZE video ids then
1899 # we can assume that this page is the last one - there
1900 # are no more ids on further pages - no need to query
1903 if len(ids_in_page) < self._PAGE_SIZE:
# Apply --playlist-start / --playlist-end slicing before downloading.
1908 all_ids_count = len(video_ids)
1909 playliststart = self._downloader.params.get('playliststart', 1) - 1
1910 playlistend = self._downloader.params.get('playlistend', -1)
1912 if playlistend == -1:
1913 video_ids = video_ids[playliststart:]
1915 video_ids = video_ids[playliststart:playlistend]
1917 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1918 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1920 for video_id in video_ids:
1921 self._downloader.download([u'http://blip.tv/'+video_id])
1924 class DepositFilesIE(InfoExtractor):
1925 """Information extractor for depositfiles.com"""
# NOTE(review): sampled view — control-flow lines are elided between the
# numbered lines; comments added in place only.
1927 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1929 def report_download_webpage(self, file_id):
1930 """Report webpage download."""
1931 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1933 def report_extraction(self, file_id):
1934 """Report information extraction."""
1935 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1937 def _real_extract(self, url):
1938 file_id = url.split('/')[-1]
1939 # Rebuild url in english locale
1940 url = 'http://depositfiles.com/en/files/' + file_id
1942 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 simulates pressing the "Free download" button.
1943 free_download_indication = { 'gateway_result' : '1' }
1944 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
1946 self.report_download_webpage(file_id)
1947 webpage = compat_urllib_request.urlopen(request).read()
1948 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1949 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
1952 # Search for the real file URL
1953 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1954 if (mobj is None) or (mobj.group(1) is None):
1955 # Try to figure out reason of the error.
# If the download form is absent, surface the site's own restriction
# message (e.g. rate limiting) rather than a generic failure.
1956 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1957 if (mobj is not None) and (mobj.group(1) is not None):
1958 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1959 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1961 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1964 file_url = mobj.group(1)
1965 file_extension = os.path.splitext(file_url)[1][1:]
1967 # Search for file title
1968 mobj = re.search(r'<b title="(.*?)">', webpage)
1970 self._downloader.trouble(u'ERROR: unable to extract title')
1972 file_title = mobj.group(1).decode('utf-8')
1975 'id': file_id.decode('utf-8'),
1976 'url': file_url.decode('utf-8'),
1978 'upload_date': None,
1979 'title': file_title,
1980 'ext': file_extension.decode('utf-8'),
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    # Numeric video id is captured from the "v" query parameter.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    # Format names ordered best -> worst; used for quality selection below.
    _available_formats = ['video', 'highqual', 'lowqual']
    _video_extensions = {
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page"""
        # Regexes for metadata embedded in the page's JavaScript calls.
        data = {'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Collect one URL per available format name.
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

    def _real_initialize(self):
        """Log in to Facebook with credentials from options or .netrc."""
        if self._downloader is None:
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # .netrc problems are a warning, not fatal: login is optional.
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))

        if useremail is None:
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        login_results = compat_urllib_request.urlopen(request).read()
        # A login <form> still present in the response means login failed.
        if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
            self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('ID')

        self.report_video_webpage_download(video_id)
        request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        page = compat_urllib_request.urlopen(request)
        video_webpage = page.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = video_info['owner']

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')

        # thumbnail image (missing thumbnail is only a warning)
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
            video_thumbnail = video_info['thumbnail']

        # upload date: RFC 2822 date string -> YYYYMMDD
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                upload_date = time.strftime('%Y%m%d', timetuple[0:9])

        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)
        format_limit = self._downloader.params.get('format_limit', None)

        if format_limit is not None and format_limit in self._available_formats:
            # Restrict to formats at or below the requested quality ceiling.
            format_list = self._available_formats[self._available_formats.index(format_limit):]
            format_list = self._available_formats
        existing_formats = [x for x in format_list if x in url_map]
        if len(existing_formats) == 0:
            self._downloader.trouble(u'ERROR: no known formats available for video')
        if req_format is None:
            video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
        elif req_format == 'worst':
            video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
        elif req_format == '-1':
            video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            if req_format not in url_map:
                self._downloader.trouble(u'ERROR: requested format not available')
            video_url_list = [(req_format, url_map[req_format])] # Specific format

        for format_param, video_real_url in video_url_list:
            # Extension usually mp4 for all Facebook formats.
            video_extension = self._video_extensions.get(format_param, 'mp4')

            # NOTE(review): these .decode('utf-8') calls assume byte
            # strings; under Python 3 they would fail — confirm target.
            'id': video_id.decode('utf-8'),
            'url': video_real_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description.decode('utf-8'),
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Captures the lowercase filename extension of a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Ask blip.tv for JSON metadata about the page.
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        self.report_extraction(mobj.group(1))
        urlh = compat_urllib_request.urlopen(request)
        if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
            # The URL serves the media itself; derive title/ext from the name.
            basename = url.split('/')[-1]
            title,ext = os.path.splitext(basename)
            title = title.decode('UTF-8')
            ext = ext.replace('.', '')
            self.report_direct_download(title)
            'upload_date': None,
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))

        if info is None: # Regular URL
            json_code_bytes = urlh.read()
            json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))

            json_data = json.loads(json_code)
            if 'Post' in json_data:
                data = json_data['Post']
            # blip.tv datestamps look like "mm-dd-yy HH:MMPM"; normalize
            # to the YYYYMMDD form used throughout this file.
            upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
            video_url = data['media']['url']
            umobj = re.match(self._URL_EXT, video_url)
                raise ValueError('Can not determine filename extension')
            ext = umobj.group(1)

                'id': data['item_id'],
                'uploader': data['display_name'],
                'upload_date': upload_date,
                'title': data['title'],
                'format': data['media']['mimeType'],
                'thumbnail': data['thumbnailUrl'],
                'description': data['description'],
                'player_url': data['embedUrl']
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))

        # NOTE(review): mutates the global std_headers dict so the
        # actual media download is fetched with an iTunes UA — confirm
        # this is intentional for all subsequent downloads.
        std_headers['User-Agent'] = 'iTunes/10.6.1'
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Return a one-element list of info dicts for the video at url."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Bug fix: was `self._download.trouble` — the attribute is
            # `_downloader` (as used everywhere else in this class), so
            # every invalid URL raised AttributeError instead of the
            # intended error report.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link encodes the media base URL; the flv lives
        # next to it under the video id.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':          video_id,
            'url':         video_url,
            'uploader':    None,
            'upload_date': None,
            'title':       video_title,
            'ext':         u'flv',
        }]
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                     |(https?://)?(www\.)?
                         (?P<showname>thedailyshow|colbertnation)\.com/
                        (full-episodes/(?P<episode>.*)|
                          (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                          |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     """
    IE_NAME = u'comedycentral'

    # Bitrates, worst to best; used for format selection.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
    _video_dimensions = {

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written with re.VERBOSE.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

    def _print_formats(self, formats):
        # List every known format with its extension and dimensions.
        print('Available formats:')
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Expand ":tds"-style shorthand into the show's full-episodes URL
        # and re-match so the named groups below are populated.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
                epTitle = mobj.group('cntitle')
            dlNewest = not mobj.group('episode')
            epTitle = mobj.group('showname')
            epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        htmlHandle = compat_urllib_request.urlopen(req)
        html = htmlHandle.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # The site redirects show front pages to a concrete episode;
        # re-match the final URL to get the episode title.
        url = htmlHandle.geturl()
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
        if mobj.group('episode') == '':
            self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
        epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', html)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.
            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', html)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
            mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        # Resolve redirects to get the canonical player URL.
        urlHandle = compat_urllib_request.urlopen(playerUrl_raw)
        playerUrl = urlHandle.geturl()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + compat_str(err))

        # Fetch the MRSS index listing every segment of the episode.
        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        # One <item> per segment of the episode.
        for itemEl in itemEls:
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            # Per-segment config XML lists the available renditions.
            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle)
            configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            for rendition in cdoc.findall('.//rendition'):
                # (bitrate, stream URL) pairs.
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)

                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])

            # For now, just pick the highest bitrate
            format,video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
                format, video_url = f, v

            # Patch to download from alternative CDN, which does not
            # break on current RTMPDump builds
            broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
            better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"

            if video_url.startswith(broken_cdn):
                video_url = video_url.replace(broken_cdn, better_cdn)

            effTitle = showId + u'-' + epTitle

                'upload_date': officialDate,
                'description': officialTitle,
                'player_url': None #playerUrl

            results.append(info)
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        # Progress message: starting extraction for this show.
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        # Progress message: fetching the player configuration.
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        webPage = compat_urllib_request.urlopen(url)
        webPageBytes = webPage.read()
        # Decode with the charset from the Content-Type header, utf-8 otherwise.
        m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
        webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))

        # Scrape metadata out of the page's <meta> tags.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        # The player URL carries the config location in its query string.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        configJSON = compat_urllib_request.urlopen(configUrl)
        m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
        configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))

        playlist = config['playlist']
        # Second playlist entry holds the actual media URL.
        videoUrl = playlist[1]['url']

            'uploader': showName,
            'upload_date': None,
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        'upload_date': None,

        self.report_extraction(video_id)
        # First fetch the metadata XML for this video id.
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        videoNode = mdoc.findall('./video')[0]
        info['description'] = videoNode.findall('./description')[0].text
        info['title'] = videoNode.findall('./caption')[0].text
        info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
        manifest_url = videoNode.findall('./file')[0].text
        self._downloader.trouble(u'\nERROR: Invalid metadata XML file')

        # Then fetch the f4m manifest referenced by the metadata.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
        node_id = media_node.attrib['url']
        video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')

        # Rebuild the direct first-segment URL from the manifest pieces.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Extract video URL (percent-encoded in the flv_url parameter).
        mobj = re.search(r'flv_url=(.+?)&', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = compat_urllib_parse.unquote(mobj.group(1))

        # Extract title from the page <title>, dropping the site suffix.
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1)

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        # Whole match is the thumbnail URL itself.
        video_thumbnail = mobj.group(0)

        'upload_date': None,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': None,
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # Resolve the page URL to the track's API resource.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        info_json_bytes = compat_urllib_request.urlopen(request).read()
        info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # Ask the CDN for the available streams for this track id.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        stream_json_bytes = compat_urllib_request.urlopen(request).read()
        stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))

        streams = json.loads(stream_json)
        # 128 kbps MP3 HTTP stream.
        mediaURL = streams['http_mp3_128_url']

            'uploader': info['user']['username'],
            'upload_date': info['created_at'],
            'title': info['title'],
            'description': info['description'],
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: the page embeds a base64-encoded media id.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        self._downloader.trouble(u'ERROR: unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1)

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        # Derive id and extension from the media file name.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'description': video_description,
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                # Probe the URL; the first one that opens wins.
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                url = None

        return None

    def _print_formats(self, formats):
        """List every available format/bitrate pair on stdout."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Return a one-element list of info dicts for the cloudcast."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # extract uploader & filename from url
        # (bug fix: regex groups are already text — str.decode('utf-8')
        # raised AttributeError under Python 3, which this file targets
        # via its compat_* imports and `except ... as err` syntax)
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            # Bug fix: bail out instead of continuing with no data.
            return

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Try each format until one yields a reachable URL.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (format_param is None and u'NA' or format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
# NOTE(review): sampled dump — interior line numbers skip (missing try:/
# return/dict-opener lines); comments only, code untouched.
# Handles three URL shapes: a specific video (course+video), a course
# page (course only), and the site root (enumerates all courses).
2991 class StanfordOpenClassroomIE(InfoExtractor):
2992 """Information extractor for Stanford's Open ClassRoom"""
# NOTE(review): the dots in "openclassroom.stanford.edu" are unescaped
# regex metacharacters — matches any char; harmless but imprecise.
2994 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2995 IE_NAME = u'stanfordoc'
2997 def report_download_webpage(self, objid):
2998 """Report information extraction."""
2999 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3001 def report_extraction(self, video_id):
3002 """Report information extraction."""
3003 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3005 def _real_extract(self, url):
3006 mobj = re.match(self._VALID_URL, url)
3008 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Branch 1: single video — fetch its companion XML for title and file.
3011 if mobj.group('course') and mobj.group('video'): # A specific video
3012 course = mobj.group('course')
3013 video = mobj.group('video')
3015 'id': course + '_' + video,
3017 'upload_date': None,
3020 self.report_extraction(info['id'])
3021 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3022 xmlUrl = baseUrl + video + '.xml'
3024 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
3025 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3026 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
3028 mdoc = xml.etree.ElementTree.fromstring(metaXml)
# IndexError here (no <title>/<videoFile> elements) is treated as bad XML.
3030 info['title'] = mdoc.findall('./title')[0].text
3031 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3033 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3035 info['ext'] = info['url'].rpartition('.')[2]
# Branch 2: course page — scrape VideoPage links and recurse via extract().
3037 elif mobj.group('course'): # A course page
3038 course = mobj.group('course')
3043 'upload_date': None,
3046 self.report_download_webpage(info['id'])
3048 coursepage = compat_urllib_request.urlopen(url).read()
3049 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3050 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
3053 m = re.search('<h1>([^<]+)</h1>', coursepage)
3055 info['title'] = unescapeHTML(m.group(1))
# Fall back to the id when the page has no <h1> title.
3057 info['title'] = info['id']
3059 m = re.search('<description>([^<]+)</description>', coursepage)
3061 info['description'] = unescapeHTML(m.group(1))
# orderedSet keeps first occurrence order while deduplicating links.
3063 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3066 'type': 'reference',
3067 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3071 for entry in info['list']:
3072 assert entry['type'] == 'reference'
3073 results += self.extract(entry['url'])
# Branch 3: site root — enumerate all CoursePage links and recurse.
3078 'id': 'Stanford OpenClassroom',
3081 'upload_date': None,
3084 self.report_download_webpage(info['id'])
3085 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3087 rootpage = compat_urllib_request.urlopen(rootURL).read()
3088 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3089 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
3092 info['title'] = info['id']
3094 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3097 'type': 'reference',
3098 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3103 for entry in info['list']:
3104 assert entry['type'] == 'reference'
3105 results += self.extract(entry['url'])
# NOTE(review): sampled dump — missing lines between the numbered rows;
# comments only, code untouched.
# Scrapes <meta> tags from the MTV video page, then queries the
# mediaGen XML service for renditions and picks the last (highest quality).
3108 class MTVIE(InfoExtractor):
3109 """Information extractor for MTV.com"""
3111 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3114 def report_extraction(self, video_id):
3115 """Report information extraction."""
3116 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3118 def _real_extract(self, url):
3119 mobj = re.match(self._VALID_URL, url)
3121 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Normalize scheme-less URLs so the download layer gets an absolute URL.
3123 if not mobj.group('proto'):
3124 url = 'http://' + url
3125 video_id = mobj.group('videoid')
3127 webpage = self._download_webpage(url, video_id)
3129 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3131 self._downloader.trouble(u'ERROR: unable to extract song name')
# NOTE(review): .decode('iso-8859-1') on a str is a Python-2 idiom;
# _download_webpage already returns text on py3 — confirm runtime.
3133 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3134 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3136 self._downloader.trouble(u'ERROR: unable to extract performer')
3138 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3139 video_title = performer + ' - ' + song_name
3141 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
# NOTE(review): message reads "unable to mtvn_uri" — verb missing
# ("unable to extract mtvn_uri"); left as-is (runtime string).
3143 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3145 mtvn_uri = mobj.group(1)
3147 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3149 self._downloader.trouble(u'ERROR: unable to extract content id')
3151 content_id = mobj.group(1)
3153 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3154 self.report_extraction(video_id)
3155 request = compat_urllib_request.Request(videogen_url)
3157 metadataXml = compat_urllib_request.urlopen(request).read()
3158 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3159 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
3162 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3163 renditions = mdoc.findall('.//rendition')
3165 # For now, always pick the highest quality.
# Assumes renditions are ordered low→high quality — TODO confirm.
3166 rendition = renditions[-1]
# MIME type like "video/mp4" → extension after the slash.
3169 _,_,ext = rendition.attrib['type'].partition('/')
3170 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3171 video_url = rendition.find('./src').text
3173 self._downloader.trouble('Invalid rendition field.')
3179 'uploader': performer,
3180 'upload_date': None,
3181 'title': video_title,
# NOTE(review): sampled dump — missing lines between numbered rows;
# comments only, code untouched.
# Youku serves video in numbered segments; this extractor derives the
# real per-segment file ids from a seed-driven character shuffle.
3189 class YoukuIE(InfoExtractor):
3190 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3192 def report_download_webpage(self, file_id):
3193 """Report webpage download."""
3194 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))
3196 def report_extraction(self, file_id):
3197 """Report information extraction."""
3198 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
# Builds a session id: millisecond timestamp + two random components.
3201 nowTime = int(time.time() * 1000)
3202 random1 = random.randint(1000,1998)
3203 random2 = random.randint(1000,9999)
3205 return "%d%d%d" %(nowTime,random1,random2)
# Deterministic PRNG shuffle of the source alphabet, keyed by `seed`
# (linear congruential step: seed = (seed*211 + 30031) % 65536).
3207 def _get_file_ID_mix_string(self, seed):
3209 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3211 for i in range(len(source)):
3212 seed = (seed * 211 + 30031 ) % 65536
3213 index = math.floor(seed / 65536 * len(source) )
3214 mixed.append(source[int(index)])
3215 source.remove(source[int(index)])
3216 #return ''.join(mixed)
# Maps the '*'-separated numeric fileId through the shuffled alphabet.
3219 def _get_file_id(self, fileId, seed):
3220 mixed = self._get_file_ID_mix_string(seed)
3221 ids = fileId.split('*')
3225 realId.append(mixed[int(ch)])
3226 return ''.join(realId)
3228 def _real_extract(self, url):
3229 mobj = re.match(self._VALID_URL, url)
3231 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3233 video_id = mobj.group('ID')
3235 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3237 request = compat_urllib_request.Request(info_url, None, std_headers)
3239 self.report_download_webpage(video_id)
3240 jsondata = compat_urllib_request.urlopen(request).read()
3241 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3242 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3245 self.report_extraction(video_id)
3247 jsonstr = jsondata.decode('utf-8')
3248 config = json.loads(jsonstr)
3250 video_title = config['data'][0]['title']
3251 seed = config['data'][0]['seed']
3253 format = self._downloader.params.get('format', None)
3254 supported_format = list(config['data'][0]['streamfileids'].keys())
# Format selection: prefer 'hd2' for best; 'worst'/explicit handled below
# (intervening lines missing from this dump).
3256 if format is None or format == 'best':
3257 if 'hd2' in supported_format:
3262 elif format == 'worst':
3270 fileid = config['data'][0]['streamfileids'][format]
3271 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3272 except (UnicodeDecodeError, ValueError, KeyError):
3273 self._downloader.trouble(u'ERROR: unable to extract info section')
3277 sid = self._gen_sid()
3278 fileid = self._get_file_id(fileid, seed)
3280 #column 8,9 of fileid represent the segment number
3281 #fileid[7:9] should be changed
3282 for index, key in enumerate(keys):
# Splice the hex segment index into chars 8-9 of the file id.
3284 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3285 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
# One info dict per segment; parts are suffixed _part00, _part01, ...
3288 'id': '%s_part%02d' % (video_id, index),
3289 'url': download_url,
3291 'upload_date': None,
3292 'title': video_title,
3295 files_info.append(info)
# NOTE(review): sampled dump — missing lines between numbered rows;
# comments only, code untouched.
# Straightforward scrape: pull flv_url, title, and thumbnail with three
# class-level regexes from the video page.
3300 class XNXXIE(InfoExtractor):
3301 """Information extractor for xnxx.com"""
3303 _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
3305 VIDEO_URL_RE = r'flv_url=(.*?)&'
3306 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3307 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3309 def report_webpage(self, video_id):
3310 """Report information extraction"""
3311 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3313 def report_extraction(self, video_id):
3314 """Report information extraction"""
3315 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3317 def _real_extract(self, url):
3318 mobj = re.match(self._VALID_URL, url)
3320 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3322 video_id = mobj.group(1)
3324 self.report_webpage(video_id)
3326 # Get webpage content
3328 webpage_bytes = compat_urllib_request.urlopen(url).read()
3329 webpage = webpage_bytes.decode('utf-8')
3330 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3331 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3334 result = re.search(self.VIDEO_URL_RE, webpage)
3336 self._downloader.trouble(u'ERROR: unable to extract video url')
# The flv_url value is percent-encoded in the page source.
3338 video_url = compat_urllib_parse.unquote(result.group(1))
3340 result = re.search(self.VIDEO_TITLE_RE, webpage)
3342 self._downloader.trouble(u'ERROR: unable to extract video title')
3344 video_title = result.group(1)
3346 result = re.search(self.VIDEO_THUMB_RE, webpage)
3348 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3350 video_thumbnail = result.group(1)
3356 'upload_date': None,
3357 'title': video_title,
3359 'thumbnail': video_thumbnail,
3360 'description': None,
# NOTE(review): sampled dump — missing lines between numbered rows;
# comments only, code untouched.
# Two-step scrape: (1) parse the Google+ post page for metadata,
# (2) follow the photo/video page and collect googlevideo redirector links.
3364 class GooglePlusIE(InfoExtractor):
3365 """Information extractor for plus.google.com."""
3367 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3368 IE_NAME = u'plus.google'
3370 def __init__(self, downloader=None):
3371 InfoExtractor.__init__(self, downloader)
# NOTE(review): "extry" in the next three docstrings is a typo for
# "entry" — docstrings only, no runtime effect.
3373 def report_extract_entry(self, url):
3374 """Report downloading extry"""
3375 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3377 def report_date(self, upload_date):
3378 """Report downloading extry"""
3379 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3381 def report_uploader(self, uploader):
3382 """Report downloading extry"""
3383 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3385 def report_title(self, video_title):
3386 """Report downloading extry"""
3387 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3389 def report_extract_vid_page(self, video_page):
3390 """Report information extraction."""
3391 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3393 def _real_extract(self, url):
3394 # Extract id from URL
3395 mobj = re.match(self._VALID_URL, url)
3397 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3400 post_url = mobj.group(0)
3401 video_id = mobj.group(1)
3403 video_extension = 'flv'
3405 # Step 1, Retrieve post webpage to extract further information
3406 self.report_extract_entry(post_url)
3407 request = compat_urllib_request.Request(post_url)
3409 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3410 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3411 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3414 # Extract update date
3416 pattern = 'title="Timestamp">(.*?)</a>'
3417 mobj = re.search(pattern, webpage)
3419 upload_date = mobj.group(1)
3420 # Convert timestring to a format suitable for filename
# Page date "YYYY-MM-DD" → canonical "YYYYMMDD" upload_date.
3421 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3422 upload_date = upload_date.strftime('%Y%m%d')
3423 self.report_date(upload_date)
3427 pattern = r'rel\="author".*?>(.*?)</a>'
3428 mobj = re.search(pattern, webpage)
3430 uploader = mobj.group(1)
3431 self.report_uploader(uploader)
3434 # Get the first line for title
3436 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3437 mobj = re.search(pattern, webpage)
3439 video_title = mobj.group(1)
3440 self.report_title(video_title)
3442 # Step 2, Stimulate clicking the image box to launch video
3443 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3444 mobj = re.search(pattern, webpage)
3446 self._downloader.trouble(u'ERROR: unable to extract video page URL')
3448 video_page = mobj.group(1)
3449 request = compat_urllib_request.Request(video_page)
3451 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3452 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3453 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3455 self.report_extract_vid_page(video_page)
3458 # Extract video links on video page
3459 """Extract video links of all sizes"""
3460 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3461 mobj = re.findall(pattern, webpage)
3463 self._downloader.trouble(u'ERROR: unable to extract video links')
3465 # Sort in resolution
# Tuples sort by the leading resolution field, ascending.
3466 links = sorted(mobj)
3468 # Choose the lowest of the sort, i.e. highest resolution
3469 video_url = links[-1]
3470 # Only get the url. The resolution part in the tuple has no use anymore
3471 video_url = video_url[-1]
3472 # Treat escaped \u0026 style hex
# str has no .decode on py3 → AttributeError path re-encodes then decodes.
3474 video_url = video_url.decode("unicode_escape")
3475 except AttributeError: # Python 3
3476 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3482 'uploader': uploader,
3483 'upload_date': upload_date,
3484 'title': video_title,
3485 'ext': video_extension,
# NOTE(review): sampled dump — missing lines between numbered rows;
# comments only, code untouched.
# Builds the CDN mp4 URL directly from the path id; scrapes metadata
# with a small regex helper.
3488 class NBAIE(InfoExtractor):
3489 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3492 def _real_extract(self, url):
3493 mobj = re.match(self._VALID_URL, url)
3495 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3498 video_id = mobj.group(1)
3499 if video_id.endswith('/index.html'):
3500 video_id = video_id[:-len('/index.html')]
3502 webpage = self._download_webpage(url, video_id)
3504 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
# Closure helper: first regex group from `webpage`, HTML-unescaped,
# or `default` when the pattern does not match.
3505 def _findProp(rexp, default=None):
3506 m = re.search(rexp, webpage)
3508 return unescapeHTML(m.group(1))
3512 shortened_video_id = video_id.rpartition('/')[2]
3513 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3515 'id': shortened_video_id,
# NOTE(review): key 'uploader_date' looks like a typo for 'upload_date'
# (see class docs' field list) — confirm against consumers before fixing.
3519 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3520 'description': _findProp(r'<div class="description">(.*?)</h1>'),
# NOTE(review): sampled dump — missing lines between numbered rows
# (paging loop header and return are absent); comments only, code untouched.
# Uses the justin.tv JSON API; channel archives are fetched in pages of
# _JUSTIN_PAGE_LIMIT, single clips in one request.
3524 class JustinTVIE(InfoExtractor):
3525 """Information extractor for justin.tv and twitch.tv"""
3526 # TODO: One broadcast may be split into multiple videos. The key
3527 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3528 # starts at 1 and increases. Can we treat all parts as one video?
3530 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3531 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3532 _JUSTIN_PAGE_LIMIT = 100
3533 IE_NAME = u'justin.tv'
3535 def report_extraction(self, file_id):
3536 """Report information extraction."""
3537 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3539 def report_download_page(self, channel, offset):
3540 """Report attempt to download a single page of videos."""
3541 self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
3542 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3544 # Return count of items, list of *valid* items
3545 def _parse_page(self, url):
3547 urlh = compat_urllib_request.urlopen(url)
3548 webpage_bytes = urlh.read()
# 'ignore' drops undecodable bytes rather than failing the whole page.
3549 webpage = webpage_bytes.decode('utf-8', 'ignore')
3550 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3551 self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
3554 response = json.loads(webpage)
3556 for clip in response:
3557 video_url = clip['video_file_url']
3559 video_extension = os.path.splitext(video_url)[1][1:]
# 'created_on' starts "YYYY-MM-DD..."; strip dashes → "YYYYMMDD".
3560 video_date = re.sub('-', '', clip['created_on'][:10])
3564 'title': clip['title'],
3565 'uploader': clip.get('user_id', clip.get('channel_id')),
3566 'upload_date': video_date,
3567 'ext': video_extension,
3569 return (len(response), info)
3571 def _real_extract(self, url):
3572 mobj = re.match(self._VALID_URL, url)
3574 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3577 api = 'http://api.justin.tv'
# lastindex == 2 when the /b/<id> clip group matched; 1 → whole channel.
3578 video_id = mobj.group(mobj.lastindex)
3580 if mobj.lastindex == 1:
3582 api += '/channel/archives/%s.json'
3584 api += '/clip/show/%s.json'
3585 api = api % (video_id,)
3587 self.report_extraction(video_id)
3591 limit = self._JUSTIN_PAGE_LIMIT
3594 self.report_download_page(video_id, offset)
3595 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3596 page_count, page_info = self._parse_page(page_url)
3597 info.extend(page_info)
# A short page (or unpaged request) means we've reached the end.
3598 if not paged or page_count != limit:
# NOTE(review): sampled dump — missing lines between numbered rows;
# comments only, code untouched.
# Scrapes the <video>/<source> tags and og:description meta from the page.
3603 class FunnyOrDieIE(InfoExtractor):
3604 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3606 def _real_extract(self, url):
3607 mobj = re.match(self._VALID_URL, url)
3609 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3612 video_id = mobj.group('id')
3613 webpage = self._download_webpage(url, video_id)
# DOTALL lets the pattern span the whitespace/newlines between tags.
3615 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3617 self._downloader.trouble(u'ERROR: unable to find video information')
3618 video_url = unescapeHTML(m.group('url'))
3620 m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
3622 self._downloader.trouble(u'Cannot find video title')
3623 title = unescapeHTML(m.group('title'))
3625 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3627 desc = unescapeHTML(m.group('desc'))
3636 'description': desc,
# NOTE(review): sampled dump — missing lines between numbered rows;
# comments only, code untouched.
# Resolves a tweetreel short URL to the status id, then builds the
# direct .mov URL on files.tweetreel.com.
3640 class TweetReelIE(InfoExtractor):
3641 _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'
3643 def _real_extract(self, url):
3644 mobj = re.match(self._VALID_URL, url)
3646 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3649 video_id = mobj.group('id')
3650 webpage = self._download_webpage(url, video_id)
3652 m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
3654 self._downloader.trouble(u'ERROR: Cannot find status ID')
3655 status_id = m.group(1)
3657 m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
3659 self._downloader.trouble(u'WARNING: Cannot find description')
# Strip anchor tags from the tweet text before unescaping.
3660 desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()
3662 m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
3664 self._downloader.trouble(u'ERROR: Cannot find uploader')
3665 uploader = unescapeHTML(m.group('uploader'))
3666 uploader_id = unescapeHTML(m.group('uploader_id'))
3668 m = re.search(r'<span unixtime="([0-9]+)"', webpage)
3670 self._downloader.trouble(u'ERROR: Cannot find upload date')
# NOTE(review): fromtimestamp uses local time, so upload_date can vary
# by the machine's timezone — confirm if UTC was intended.
3671 upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')
3674 video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'
3681 'description': desc,
3682 'uploader': uploader,
3683 'uploader_id': uploader_id,
3684 'internal_id': status_id,
3685 'upload_date': upload_date
# NOTE(review): sampled dump — missing lines between numbered rows;
# comments only, code untouched.
# The store page embeds 'movie_<id>' JS blobs; each is paired positionally
# with a <span class="title"> for its name.
3689 class SteamIE(InfoExtractor):
3690 _VALID_URL = r"""http://store.steampowered.com/
3691 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3693 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
# Overrides the base suitable() because _VALID_URL needs re.VERBOSE.
3696 def suitable(self, url):
3697 """Receives a URL and returns True if suitable for this IE."""
3698 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
3700 def _real_extract(self, url):
3701 m = re.match(self._VALID_URL, url, re.VERBOSE)
3702 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3703 gameID = m.group('gameID')
3704 videourl = 'http://store.steampowered.com/video/%s/' % gameID
3705 webpage = self._download_webpage(videourl, gameID)
3706 mweb = re.finditer(urlRE, webpage)
3707 namesRE = r'<span class=\"title\">(?P<videoName>[\w:/\.\?=\+\s-]+)</span>'
3708 titles = list(re.finditer(namesRE, webpage))
# zip pairs movies with titles by document order — assumes both lists
# appear in the same order on the page (TODO confirm).
3710 for vid,vtitle in zip(mweb,titles):
3711 video_id = vid.group('videoID')
3712 title = vtitle.group('videoName')
3713 video_url = vid.group('videoURL')
3715 self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
# NOTE(review): sampled dump — missing lines between numbered rows;
# comments only, code untouched.
# Recorded-video URL maps directly to the tcdn.ustream.tv CDN path;
# title and uploader are scraped from data- attributes on the page.
3725 class UstreamIE(InfoExtractor):
3726 _VALID_URL = r'http://www.ustream.tv/recorded/(?P<videoID>\d+)'
3727 IE_NAME = u'ustream'
3729 def _real_extract(self, url):
# No "invalid URL" guard here — suitable() already matched _VALID_URL.
3730 m = re.match(self._VALID_URL, url)
3731 video_id = m.group('videoID')
3732 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3733 webpage = self._download_webpage(url, video_id)
3734 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3735 title = m.group('title')
3736 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3737 uploader = m.group('uploader')
3743 'uploader': uploader
3748 def gen_extractors():
3749 """ Return a list of an instance of every supported extractor.
3750 The order does matter; the first extractor matched is the one handling the URL.
3753 YoutubePlaylistIE(),
3777 StanfordOpenClassroomIE(),