2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # Per the class docstring, broken extractors set this to False.
    # NOTE(review): the default assignment was not visible in the reviewed
    # extract; True is the only default consistent with the docstring.
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url) is not None

    def working(self):
        """Getter method for _WORKING."""
        return self._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # NOTE(review): the initialize() call was not visible in the reviewed
        # extract, but without it _real_initialize() is never invoked.
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Derive the extractor name from the class name by stripping the
        # trailing "IE" suffix, e.g. YoutubeIE -> "Youtube".  Subclasses may
        # shadow this with a plain class attribute (e.g. IE_NAME = u'metacafe').
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        if note is None:
            note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        webpage_bytes = urlh.read()
        return webpage_bytes.decode('utf-8', 'replace')
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # NOTE(review): this view of the file is truncated -- lines are missing
    # throughout the class (e.g. the opening of the _VALID_URL raw-string
    # literal directly below, several `try:` headers before dangling
    # `except` clauses, `if mobj is None:` guards and `return` statements).
    # Code is left byte-identical; only comments were added.

    # Body of the verbose _VALID_URL pattern (its `_VALID_URL = r'''` opener
    # is not visible in this extract):
    (?:https?://)?                                       # http(s):// (optional)
    (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
       tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
    (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
    (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
    (?:                                                  # the various things that can precede the ID:
        (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
        |(?:                                             # or the v= param in all its forms
            (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
            (?:\?|\#!?)                                  # the params delimiter ? or # or #!
            (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
        )?                                               # optional -> youtube.com/xxxx is OK
    )?                                                   # all until now is optional -> you can pass the naked ID
    ([0-9A-Za-z_-]+)                                     # here is it! the YouTube video ID
    (?(1).+)?                                            # if we found the ID, everything can follow
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension map (most entries not visible in this extract).
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # itag -> "WxH" resolution map (entries not visible in this extract).
    _video_dimensions = {

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # re.VERBOSE is required because _VALID_URL is a commented pattern.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        # Convert YouTube's timedtext XML into .srt text.
        # NOTE(review): the `srt = ''` initialization is not visible in this
        # extract; the `srt +=` lines below rely on it.
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'
            # NOTE(review): `start` is a regex capture (str) here; a
            # `start = float(start)` conversion appears to be missing from
            # this view -- confirm against the full file.
            end = start + float(dur)
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        # (return statement not visible in this extract)

    def _extract_subtitles(self, video_id):
        # Returns a (warning_message, srt_contents) pair; exactly one is None.
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        # (the matching `try:` header is not visible in this extract)
        srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        # Map lang_code -> human-readable track name.
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            # (the 'en' branch body and a following `else:` header are not
            # visible in this extract)
            srt_lang = list(srt_lang_list.keys())[0]
        if not srt_lang in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        # (remaining items of this dict literal are not visible)
        params = compat_urllib_parse.urlencode({
            'name': srt_lang_list[srt_lang].encode('utf-8'),
        url = 'http://www.youtube.com/api/timedtext?' + params
        srt_xml = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        # (the guard condition for this early return is not visible)
        return (u'WARNING: Did not fetch video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))

    def _print_formats(self, formats):
        print('Available formats:')
        # (the `for x in formats:` header is not visible in this extract)
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        # Set language, log in (username/password or .netrc), confirm age.
        if self._downloader is None:
            # (body of this guard not visible in this extract)

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # (the `try:` header and surrounding lines are not visible)
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))

        # Set language preference so date strings etc. come back in English.
        request = compat_urllib_request.Request(self._LANG_URL)
        compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))

        # No authentication to be performed

        request = compat_urllib_request.Request(self._LOGIN_URL)
        login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to fetch login page: %s' % compat_str(err))

        # Scrape the hidden GALX/dsh tokens required by the Google login form.
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)

        # (opening of the login_form_strs dict literal and several of its
        # entries are not visible in this extract)
        u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
        u'PersistentCookie': u'yes',
        u'bgresponse': u'js_disabled',
        u'checkConnection': u'',
        u'checkedDomains': u'youtube',
        u'signIn': u'Sign in',
        u'service': u'youtube',

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        # If the login form is still present, authentication failed.
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))

        # (opening of the age_form dict literal not visible)
        'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        self.report_age_confirmation()
        age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))

    def _extract_id(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # (the `if mobj is None:` guard is not visible in this extract)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        # Group 2 of _VALID_URL is the bare video id.
        video_id = mobj.group(2)
        # (return statement not visible in this extract)

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        # Unescape the JS-escaped URL (\\/ -> /).
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Get video info: try several `el` values until one returns a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
            video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        video_uploader_id = mobj.group(1)
        self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
        else: # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: scraped from the page, then normalized to YYYYMMDD.
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
        for expression in format_expressions:
            upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
            self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
        video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # itag -> signed download URL.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific formats. We pick the first in a slash-delimited sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

        # Build a result dict per selected format.
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            # (opening of this result-dict literal is not visible)
            'url': video_real_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
            'format': video_format,
            'thumbnail': video_thumbnail,
            'description': video_description,
            'player_url': player_url,
            'subtitles': video_subtitles,
            'duration': video_duration
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # NOTE(review): this view of the class is truncated -- `try:` headers,
    # `if mobj is None:` guards, dict-literal openers and `return` statements
    # are missing in places.  Code is left byte-identical; only comments added.

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        # (the matching `try:` header is not visible in this extract)
        self.report_disclaimer()
        disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age by POSTing the family-filter form.
        # (opening of the disclaimer_form dict literal is not visible)
        'submit': "Continue - I'm over 18",
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        self.report_age_confirmation()
        disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate yt-prefixed ids to the YouTube extractor via the downloader.
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))
        video_extension = mediaURL[-3:]

        # Extract gdaKey if available
        mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
        gdaKey = mobj.group(1)
        video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        # Fallback: parse the flashvars blob for mediaURL/key.
        mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        vardict = compat_parse_qs(mobj.group(1))
        if 'mediaData' not in vardict:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = mobj.group(1).replace('\\/', '/')
        video_extension = mediaURL[-3:]
        video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        # NOTE(review): .decode() on a regex group implies `webpage` is bytes
        # here -- confirm against the full file (Python 2 era code).
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # (opening of this result-dict literal is not visible)
        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'uploader': video_uploader.decode('utf-8'),
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # NOTE(review): this view of the class is truncated -- `if mobj is None:`
    # guards, `else:` headers and the result-dict opener are missing in
    # places.  Code is left byte-identical; only comments added.

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Strip title/query decoration: "id_title?x=y" -> "id".
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter so restricted videos resolve.
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Pick the best available quality key, highest first.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            # (the `if key in flashvars:` guard and `max_quality = key`
            # assignment are not visible in this extract)
            self._downloader.to_screen(u'[dailymotion] Using %s' % key)
        self._downloader.trouble(u'ERROR: unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        self._downloader.trouble(u'ERROR: unable to extract video URL')

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        # looking for official user
        mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
        if mobj_official is None:
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
        video_uploader = mobj_official.group(1)
        video_uploader = mobj.group(1)

        video_upload_date = None
        # Page shows DD-MM-YYYY; reassemble as YYYYMMDD.
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        # (opening of this result-dict literal is not visible)
        'uploader': video_uploader,
        'upload_date': video_upload_date,
        'title': video_title,
        'ext': video_extension,
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # NOTE(review): this view of the class is truncated -- `try:` headers,
    # `if mobj is None:` guards and the result-dict opener are missing in
    # places.  Code is left byte-identical; only comments added.

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # (the matching `try:` header is not visible in this extract)
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))
        # (the `video_url = mediaURL` assignment is not visible)

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        # (opening of this result-dict literal is not visible)
        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'uploader': video_uploader,
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # NOTE(review): this view of the class is truncated -- `try:` headers,
    # `if mobj is None:` guards and the result-dict opener are missing in
    # places.  Code is left byte-identical; only comments added.

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            # (the matching `try:` header is not visible in this extract)
            webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            # Recurse once with the canonical /watch/ URL.
            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video uploader')
        # NOTE(review): group(1) is the 'people|profile' alternation, not the
        # anchor text (group(2)) -- confirm against the full file.
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
        yv_bitrate = '700' # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        self._downloader.trouble(u'ERROR: Unable to extract media URL')
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        # (opening of this result-dict literal is not visible)
        'id': video_id.decode('utf-8'),
        'uploader': video_uploader,
        'upload_date': None,
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
        'thumbnail': video_thumbnail.decode('utf-8'),
        'description': video_description,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract the info dictionary for a Vimeo video URL.

        Returns a one-element list of info dicts (see InfoExtractor docs),
        or None after reporting trouble via the downloader.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        # Normalize the URL: force a protocol and resolve direct player links.
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except:
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date (YYYYMMDD) from the itemprop meta tag, if present.
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the best available quality; the for/else reports failure.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download a webpage and return its raw bytes (None on trouble)."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url and match regex against it; return a dict built from
        matchTuples, each a (group_index, key, error_message) triple."""
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Extract the rtmp URL of a live stream (side effect only)."""
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path', u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url', u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the chain of +7 description documents and return an info dict."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id', u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date', u'ERROR: could not extract video date: %s' % url),
                (4, 'url', u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title').decode('utf-8'),
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            # Force a HEAD request so we only fetch headers, not the body.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers; a HEAD request has no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener from scratch so only our handlers are installed.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True

    def _real_extract(self, url):
        """Last-resort extraction: scrape the page for an http media URL."""
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearch prefix and hand off to _download_n_results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the leading 'ytsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # The API reports the true total; never ask for more than exists.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the gvsearch prefix and hand off to _download_n_results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the leading 'gvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # No further pages: download everything collected so far.
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearch prefix and hand off to _download_n_results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the leading 'yvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # No further pages: download everything collected so far.
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Collect all video ids of a playlist and queue them for download."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Single video case: the URL carries an individual video id too.
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])
            return

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        else:
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(playlist_id, pagenum)
            url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        total = len(video_ids)

        # Honor the --playlist-start / --playlist-end options.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        if len(video_ids) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        """Collect all video ids of a channel and queue them for download."""
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Download channel pages
        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(channel_id, pagenum)
            url = self._TEMPLATE_URL % (channel_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []
            for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Collect all upload ids of a user via the GData API and queue them."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        # Honor the --playlist-start / --playlist-end options.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12  # result size per Ajax query, referenced below
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Collect all video ids of a blip.tv user and queue them for download."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            mobj = re.search(r'data-users-id="([^"]+)"', page)
            page_base = page_base % mobj.group(1)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)

            request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        # Honor the --playlist-start / --playlist-end options.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com

    Simulates pressing the 'Free download' button, then scrapes the real
    fileshare URL and the title from the resulting page.

    NOTE(review): listing is elided — `try:`, guards, `return` and the
    enclosing info-dict literal are missing; indentation reconstructed.
    """

    # (?#locale) is a regex comment: the ../ segment is an optional locale prefix.
    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        # (elided try:)
        self.report_download_webpage(file_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Site showed an explicit restriction notice; surface it.
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
                # (elided else:) generic failure when no notice was found
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        # Extension without the leading dot.
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        # (elided guard: if mobj is None)
        self._downloader.trouble(u'ERROR: unable to extract title')
        file_title = mobj.group(1).decode('utf-8')

        # (elided: return [{ ... ) — fields of the returned info dict:
        'id': file_id.decode('utf-8'),
        'url': file_url.decode('utf-8'),
        'upload_date': None,
        'title': file_title,
        'ext': file_extension.decode('utf-8'),
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook.

    _real_initialize logs in (credentials from downloader params or
    ~/.netrc); _real_extract parses the JSON embedded between two fixed
    swf-setup markers on the video page.

    NOTE(review): listing is elided — guards, `try:`, `return` lines and
    the login_form/info-dict literals are missing; indentation
    reconstructed.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    # Machine name used to look up credentials in ~/.netrc.
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)

    def _real_initialize(self):
        if self._downloader is None:
            # (elided: return)

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # (elided try:)
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
                # (elided: useremail/password unpacked from info; else:)
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))

        if useremail is None:
            # (elided: return — no credentials; login_form is built in elided lines)

        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        # (elided try:)
        login_results = compat_urllib_request.urlopen(request).read()
        # Login form still present in the response => credentials rejected.
        if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
            self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # (elided guard: if mobj is None)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The swf-variables JSON sits between these two literal markers.
        BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        # (elided guard: if not m)
        raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        # Prefer the HD stream, fall back to SD; neither present is fatal.
        video_url = params['hd_src']
        # (elided: if not video_url)
        video_url = params['sd_src']
        # (elided: if not video_url)
        raise ExtractorError(u'Cannot find video URL')
        video_duration = int(params['video_duration'])

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        # (elided guard: if not m)
        raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        # (elided: info = { ... ) — fields of the returned info dict:
        'title': video_title,
        'duration': video_duration,
        'thumbnail': params['thumbnail_src'],
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv

    Requests the page with skin=json; the response is either the video
    itself (Content-Type video/* => direct download) or JSON metadata,
    usually wrapped under a 'Post' key.

    NOTE(review): listing is elided — `try:`, guards, `else:` branches
    and the info-dict literals are missing; indentation reconstructed.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Pulls the filename extension out of a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # (elided guard: if mobj is None)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # (elided: cchar chosen as '?' or '&' depending on url — TODO confirm)
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves different content per user agent; mimic iTunes.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        # (elided: info = None; try:)
        urlh = compat_urllib_request.urlopen(request)
        if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
            basename = url.split('/')[-1]
            title,ext = os.path.splitext(basename)
            title = title.decode('UTF-8')
            ext = ext.replace('.', '')
            self.report_direct_download(title)
            # (elided: info = { id/url/title/ext ... ) — visible field:
            'upload_date': None,
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            # (elided try:)
            json_code_bytes = urlh.read()
            json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))

            # (elided try:)
            json_data = json.loads(json_code)
            if 'Post' in json_data:
                data = json_data['Post']
                # (elided else:) data = json_data — TODO confirm

            # Timestamps look like '%m-%d-%y %H:%M%p'; normalized to YYYYMMDD.
            upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
            video_url = data['media']['url']
            umobj = re.match(self._URL_EXT, video_url)
            # (elided guard: if umobj is None)
            raise ValueError('Can not determine filename extension')
            ext = umobj.group(1)

            # (elided: info = { ... ) — visible fields of the info dict:
            'id': data['item_id'],
            'uploader': data['display_name'],
            'upload_date': upload_date,
            'title': data['title'],
            'format': data['media']['mimeType'],
            'thumbnail': data['thumbnailUrl'],
            'description': data['description'],
            'player_url': data['embedUrl'],
            'user_agent': 'iTunes/10.6.1',
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de.

    Scrapes the watch page for the media server base URL (from the
    image_src thumbnail link) and the <title> tag, then returns a single
    flv info dict.

    FIX: the invalid-URL error path called ``self._download.trouble`` —
    there is no ``_download`` attribute; every other extractor in this
    file uses ``self._downloader`` (cf. the other trouble() calls in
    this very class), so an invalid URL raised AttributeError instead of
    reporting the error. Corrected to ``self._downloader``.
    NOTE(review): elided guard/return lines were reconstructed from the
    standard extractor pattern used by the sibling classes.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # was: self._download.trouble(...) — nonexistent attribute
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link carries the movie directory on the media host;
        # the flv lives at <base>/<video_id>.flv.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date': None,
            'title':    video_title,
            'ext':      u'flv',
        }]
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report

    Resolves the page to an mtvnservices media URI, fetches the mrss
    show index, downloads per-part mediagen config XML, picks a bitrate,
    and rewrites the rtmp URL onto an http mirror.

    NOTE(review): listing is elided — guards, `try:`, `else:` branches,
    the format/dimension tables and parts of the regex are missing;
    indentation reconstructed.
    """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    # (elided: a (?P<clip> ... ) wrapper and closing $""" appear to be missing)
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))

    # Known bitrates, lowest preference first; turls below is ordered the same way.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # (elided mapping entries for both tables)
    _video_extensions = {
    _video_dimensions = {

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written with re.VERBOSE.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        print('Available formats:')
        # (elided: for x in formats:)
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # (elided guard: if mobj is None)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
                # (elided else:)
                url = u'http://www.colbertnation.com/full-episodes/'
            # Re-match so the named groups reflect the rewritten url.
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
                # (elided else:)
                epTitle = mobj.group('cntitle')
            # (elided else: branch for full-episode urls)
            dlNewest = not mobj.group('episode')
            # (elided: if dlNewest:)
            epTitle = mobj.group('showname')
            # (elided else:)
            epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        # (elided try:)
        htmlHandle = compat_urllib_request.urlopen(req)
        html = htmlHandle.read()
        webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # (elided: if dlNewest:) follow the redirect to the newest episode
        url = htmlHandle.geturl()
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # (elided guard: if mobj is None)
        self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
        if mobj.group('episode') == '':
            self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
        epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                # (elided else:)
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        # (elided try:)
        indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))

        # (elided: results = [])
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            # (elided try:)
            configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            # (elided: turls = [])
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                # (elided: turls.append(finfo))

            # (elided: if len(turls) == 0:)
            self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                # (elided: return)

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            # (elided: for f,v in turls: if f == req_format:)
            format, rtmp_video_url = f, v

            # Derive the http mirror path from the rtmp URL.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            # (elided guard: if not m)
            raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            # (elided: info = { ... ) — visible fields:
            'upload_date': officialDate,
            'description': officialTitle,

            results.append(info)
        # (elided: return results)
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist

    Reads og: meta tags for description/thumbnail/player URL, follows
    the player's config= query param to a JS-flavoured JSON config, and
    takes the second playlist entry as the video URL.

    NOTE(review): listing is elided — guards, `try:`, `return` and the
    info-dict opening are missing; indentation reconstructed.
    """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # (elided guard: if mobj is None)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        # (elided try:)
        webPage = compat_urllib_request.urlopen(url)
        webPageBytes = webPage.read()
        # Decode with the charset advertised in the Content-Type header,
        # defaulting to utf-8 when none is given.
        m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
        webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))

        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        # The player URL carries the config location in its query string.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        # (elided try:)
        configJSON = compat_urllib_request.urlopen(configUrl)
        m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
        configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        # (elided try:)
        config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))

        playlist = config['playlist']
        # Entry [1] is the actual video (entry [0] presumably an ad/intro — TODO confirm).
        videoUrl = playlist[1]['url']

        # (elided: info = { id/url/... ) — visible fields:
        'uploader': showName,
        'upload_date': None,
        'thumbnail': imgUrl,
        'description': description,
        'player_url': playerUrl,
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com

    Fetches the moogaloop metadata XML for the video, then the Adobe f4m
    manifest it points to, and assembles the final segment URL.

    NOTE(review): listing is elided — guards, `try:` lines and the
    info-dict opening are missing; indentation reconstructed.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # (elided guard: if mobj is None)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # (elided: info = { 'id': video_id, ... ) — visible field:
        'upload_date': None,

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        # (elided try:)
        metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        # (elided try:)
        videoNode = mdoc.findall('./video')[0]
        info['description'] = videoNode.findall('./description')[0].text
        info['title'] = videoNode.findall('./caption')[0].text
        info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
        manifest_url = videoNode.findall('./file')[0].text
        # (elided: except IndexError:)
        self._downloader.trouble(u'\nERROR: Invalid metadata XML file')

        # hdcore param is required for the f4m manifest to be served.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        # (elided try:)
        manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        # (elided try:)
        media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
        node_id = media_node.attrib['url']
        video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')

        # Build the first-segment URL from the manifest location and ids.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
        # (elided: info['url']/info['ext'] assignment and return)
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com

    Scrapes flv_url, the <title> tag and the thumbnail URL from the
    watch page.

    NOTE(review): listing is elided — guards/`return` and the info-dict
    opening are missing; indentation reconstructed.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # (elided guard: if mobj is None)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Extract video URL (percent-encoded in the flv_url page variable)
        mobj = re.search(r'flv_url=(.+?)&', webpage)
        # (elided guard: if mobj is None)
        self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = compat_urllib_parse.unquote(mobj.group(1))

        # Extract title
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        # (elided guard: if mobj is None)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1)

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        # (elided guard: if mobj is None)
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        # group(0): the whole matched URL is the thumbnail address.
        video_thumbnail = mobj.group(0)

        # (elided: return [{ id/url/... ) — visible fields:
        'upload_date': None,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': None,
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid

    NOTE(review): listing is elided — `try:` lines and part of the
    returned dict are missing; indentation reconstructed.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # (elided guard: if mobj is None)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # resolve.json maps the public page URL to the API track object.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        # (elided try:)
        info_json_bytes = compat_urllib_request.urlopen(request).read()
        info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        # (elided try:)
        stream_json_bytes = compat_urllib_request.urlopen(request).read()
        stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        # (elided: return [{ 'id'/'url'/'ext'... ) — visible fields:
        'uploader': info['user']['username'],
        'upload_date': info['created_at'],
        'title': info['title'],
        'description': info['description'],
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com

    The real media id is base64-encoded in the page's jsclassref
    variable; decoding it yields the rtmpe path.

    NOTE(review): listing is elided — guards/`return` and the info-dict
    opening are missing; indentation reconstructed.
    """
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # (elided guard: if mobj is None)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        # (elided guard: if mobj is None)
        self._downloader.trouble(u'ERROR: unable to extract video url')
        # jsclassref is base64; decoded+unquoted it is the rtmp stream path.
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        # (elided guard: if mobj is None)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1)

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        # (elided: info = { id/url/... ) — visible fields:
        'upload_date': None,
        'title': video_title,
        'ext': extension, # Extension is always(?) mp4, but seems to be flv
        'description': video_description,
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com

    Uses the site's JSON API to get per-format url lists, probes each
    url until one responds, and returns that one.

    NOTE(review): listing is elided — `try:`, `return`, loop bodies and
    parts of the format-selection logic are missing; indentation
    reconstructed. Also note the `.decode('utf-8')` calls on values
    that are already `str` under Python 3 — presumably py2 leftovers;
    the class is disabled (_WORKING = False) anyway.
    """

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        # (elided try:)
        bitrate_list = jsonData[fmt]
        if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
            bitrate = max(bitrate_list) # select highest

        url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        # (elided: return url_list)

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            # (elided try: ... return url on success)
            compat_urllib_request.urlopen(url)
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # (elided: skip dead url / return None at the end)

    def _print_formats(self, formats):
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                # (elided try:)
                ext = formats[fmt][b][0]
                print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    # (elided: break)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # (elided guard: if mobj is None)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        # (elided try:)
        self.report_download_json(file_url)
        jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            # (elided: return)

        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    # (elided: break — first live format wins; else:)
                    if req_format not in formats:
                        self._downloader.trouble(u'ERROR: format is not available')
                        # (elided: return)

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        # (elided: return [{ ) — returned info dict:
        'id': file_id.decode('utf-8'),
        'url': file_url.decode('utf-8'),
        'uploader': uploader.decode('utf-8'),
        'upload_date': None,
        'title': json_data['name'],
        'ext': file_url.split('.')[-1].decode('utf-8'),
        'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
        'thumbnail': json_data['thumbnail_url'],
        'description': json_data['description'],
        'player_url': player_url.decode('utf-8'),
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom.

    _real_extract handles three URL shapes: a specific video
    (course + video query args), a whole course page (course only),
    and the site root (neither).  Course and root pages build
    'reference' entries that are resolved recursively via self.extract().

    NOTE(review): this listing is an elided view of the source; guard
    clauses (`if mobj is None:`), `try:` openers, `info = {` dict
    openers and `return` statements are missing at several points below.
    """
    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report webpage download for object `objid`."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard elided before this raise.
        raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            # NOTE(review): `info = {` opener elided before these entries.
                'id': course + '_' + video,
                'upload_date': None,

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            # NOTE(review): `try:` opener elided before the next line.
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            # NOTE(review): `try:`/`except IndexError:` structure elided around
            # the findall() lookups and the error report below.
            info['title'] = mdoc.findall('./title')[0].text
            info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            # NOTE(review): `info = {` opener elided.
                'upload_date': None,

            coursepage = self._download_webpage(url, info['id'],
                            note='Downloading course info page',
                            errnote='Unable to download course info page')

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            # Prefer the page <h1> as title, fall back to the id.
            info['title'] = unescapeHTML(m.group(1))
            info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            info['description'] = unescapeHTML(m.group(1))

            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            # NOTE(review): the list-comprehension / loop building these
            # reference entries is elided.
                'type': 'reference',
                'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),

            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
        # NOTE(review): root-page branch (`else:`) header elided.
            # NOTE(review): `info = {` opener elided.
                'id': 'Stanford OpenClassroom',
                'upload_date': None,

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            # NOTE(review): `try:` opener elided before the next line.
            rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
                'type': 'reference',
                'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),

            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com.

    Reads song/performer metadata from <meta> tags on the video page,
    fetches the mediaGen XML playlist, and picks the last rendition
    (highest quality).

    NOTE(review): elided view — `if mobj is None:` guards, `try:`
    openers and the final `info = {` / `return` lines are missing below.
    """
    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url  # default to plain http when the scheme was omitted
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract song name')
        song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract performer')
        performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        self._downloader.trouble(u'ERROR: unable to mtvn_uri')
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        self._downloader.trouble(u'ERROR: unable to extract content id')
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        # NOTE(review): `try:` opener elided before the next line.
        metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        # NOTE(review): `try:`/`except KeyError:` structure elided around
        # the attribute accesses and the error report below.
        _,_,ext = rendition.attrib['type'].partition('/')
        format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
        video_url = rendition.find('./src').text
        self._downloader.trouble('Invalid rendition field.')

        # NOTE(review): `info = {` opener elided before these entries.
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    De-obfuscates Youku's file ids via a seeded pseudo-random character
    mix (`_get_file_ID_mix_string` / `_get_file_id`) and emits one info
    entry per video segment.

    NOTE(review): elided view — some method headers (e.g. `_gen_sid`),
    guards, `try:` openers, loop headers and `return` lines are missing.
    """
    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    # NOTE(review): `def _gen_sid(self):` header elided before these lines.
    # Builds a session id from the current time plus two random suffixes.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        # Deterministic shuffle of the alphabet, driven by `seed` (LCG-style).
        # NOTE(review): `mixed = []` initializer elided in this view.
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        for i in range(len(source)):
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)

    def _get_file_id(self, fileId, seed):
        """Map the '*'-separated obfuscated id onto the mixed alphabet."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        # NOTE(review): `realId = []` and the loop over `ids` are elided here.
            realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        # NOTE(review): `try:` opener elided before the next line.
        self.report_download_webpage(video_id)
        jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        self.report_extraction(video_id)
        # NOTE(review): `try:` opener elided before the JSON decode below.
        jsonstr = jsondata.decode('utf-8')
        config = json.loads(jsonstr)

        video_title = config['data'][0]['title']
        seed = config['data'][0]['seed']

        format = self._downloader.params.get('format', None)
        supported_format = list(config['data'][0]['streamfileids'].keys())

        # Map the requested generic format to Youku's format names.
        if format is None or format == 'best':
            if 'hd2' in supported_format:
        elif format == 'worst':

        fileid = config['data'][0]['streamfileids'][format]
        keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.trouble(u'ERROR: unable to extract info section')

        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            # NOTE(review): `info = {` opener elided before these entries.
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'upload_date': None,
                'title': video_title,

            files_info.append(info)
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com.

    Scrapes the flv url, title and thumbnail out of the watch page
    using the three regexes below.

    NOTE(review): elided view — `if ... is None:` guards, the `try:`
    opener, the final `info = {` opener and `return` are missing below.
    """
    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'

    VIDEO_URL_RE = r'flv_url=(.*?)&'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def report_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        # Get webpage content
        # NOTE(review): `try:` opener elided before the next line.
        webpage_bytes = compat_urllib_request.urlopen(url).read()
        webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)

        result = re.search(self.VIDEO_URL_RE, webpage)
        self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = compat_urllib_parse.unquote(result.group(1))

        result = re.search(self.VIDEO_TITLE_RE, webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = result.group(1)

        result = re.search(self.VIDEO_THUMB_RE, webpage)
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = result.group(1)

        # NOTE(review): `return [{` opener elided before these entries.
            'upload_date': None,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'description': None,
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com posts.

    Step 1 fetches the post page for date/uploader/title; step 2 follows
    the photo-box link to the video page and picks the highest-resolution
    redirector.googlevideo.com link.

    NOTE(review): elided view — `if mobj is None:` guards, `try:`
    openers and the final `return [{` / closing lines are missing below.
    """
    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry."""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report the entry's upload date."""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report the entry's uploader."""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report the entry's title."""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        # NOTE(review): `try:` opener elided before the next line.
        webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))

        # Extract update date
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        upload_date = mobj.group(1)
        # Convert timestring to a format suitable for filename
        upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
        upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Get the first line for title
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        self._downloader.trouble(u'ERROR: unable to extract video page URL')

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        # NOTE(review): `try:` opener elided before the next line.
        webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
        self.report_extract_vid_page(video_page)

        # Extract video links on video page
        """Extract video links of all sizes"""
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        self._downloader.trouble(u'ERROR: unable to extract video links')

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        # NOTE(review): `try:` opener elided before the next line (the Python 2 path).
        video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        # NOTE(review): `return [{` opener elided before these entries.
            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
class NBAIE(InfoExtractor):
    """Information extractor for nba.com videos.

    Derives the CDN mp4 URL directly from the page path and scrapes
    title/date/description from meta tags via the `_findProp` helper.

    NOTE(review): elided view — the `if mobj is None:` guard, parts of
    `_findProp` (its `else` / default return) and the `info = {` opener
    and `return` are missing below.
    """
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
        def _findProp(rexp, default=None):
            # Small closure over `webpage`: first regex group, unescaped,
            # or `default` when the pattern does not match.
            m = re.search(rexp, webpage)
            return unescapeHTML(m.group(1))

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        # NOTE(review): `info = {` opener elided before these entries.
        # 'uploader_date' looks like a typo for 'upload_date' — verify upstream.
            'id': shortened_video_id,
            'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?
    #
    # NOTE(review): elided view — guards, `try:` openers, loop headers
    # (the paging `while`/offset bookkeeping) and `return` lines are
    # missing at several points below.

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        # NOTE(review): `try:` opener elided before the next line.
        urlh = compat_urllib_request.urlopen(url)
        webpage_bytes = urlh.read()
        webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))

        response = json.loads(webpage)
        if type(response) != list:
            # API reports errors as a dict rather than a list of clips.
            error_text = response.get('error', 'unknown error')
            self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
        # NOTE(review): `info = []` initializer elided before this loop.
        for clip in response:
            video_url = clip['video_file_url']
            video_extension = os.path.splitext(video_url)[1][1:]
            video_date = re.sub('-', '', clip['start_time'][:10])
            video_uploader_id = clip.get('user_id', clip.get('channel_id'))
            video_id = clip['id']
            video_title = clip.get('title', video_id)
            # NOTE(review): `info.append({` opener elided before these entries.
                'title': video_title,
                'uploader': clip.get('channel_name', video_uploader_id),
                'uploader_id': video_uploader_id,
                'upload_date': video_date,
                'ext': video_extension,
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        # Group 1 = channel name, group 2 = broadcast id.
        if mobj.lastindex == 1:
            api += '/channel/archives/%s.json'
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        limit = self._JUSTIN_PAGE_LIMIT
        # NOTE(review): the paging loop header and `offset` bookkeeping
        # are elided around the following lines.
        self.report_download_page(video_id, offset)
        page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
        page_count, page_info = self._parse_page(page_url)
        info.extend(page_info)
        # A short page means we have reached the last page.
        if not paged or page_count != limit:
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com.

    NOTE(review): elided view — `if m is None:` guards and the final
    `info = {` / `return` lines are missing below.
    """
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        self._downloader.trouble(u'ERROR: unable to find video information')
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
        self._downloader.trouble(u'Cannot find video title')
        title = unescapeHTML(m.group('title'))

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(m.group('desc'))

        # NOTE(review): `info = {` opener elided before this entry.
            'description': desc,
class TweetReelIE(InfoExtractor):
    """Information extractor for tweetreel.com.

    NOTE(review): elided view — `if m is None:` guards and the final
    `info = {` / `return` lines are missing below.
    """
    _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
        self._downloader.trouble(u'ERROR: Cannot find status ID')
        status_id = m.group(1)

        m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
        self._downloader.trouble(u'WARNING: Cannot find description')
        # Strip anchor tags from the tweet text before unescaping.
        desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()

        m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
        self._downloader.trouble(u'ERROR: Cannot find uploader')
        uploader = unescapeHTML(m.group('uploader'))
        uploader_id = unescapeHTML(m.group('uploader_id'))

        m = re.search(r'<span unixtime="([0-9]+)"', webpage)
        self._downloader.trouble(u'ERROR: Cannot find upload date')
        upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')

        video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'

        # NOTE(review): `info = {` opener elided before these entries.
            'description': desc,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'internal_id': status_id,
            'upload_date': upload_date
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com game trailers.

    NOTE(review): elided view — part of _VALID_URL (including the
    `gameID` group referenced below), guards and the result-list
    building/return lines are missing.
    """
    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose regex, so re.VERBOSE is required here.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        gameID = m.group('gameID')
        videourl = 'http://store.steampowered.com/video/%s/' % gameID
        webpage = self._download_webpage(videourl, gameID)
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        # Movie javascript blocks, titles and thumbnails appear in the
        # same page order, so zip them together.
        for vid,vtitle,thumb in zip(mweb,titles,thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
            # NOTE(review): `info = {` opener elided before these entries.
                'title': unescapeHTML(title),
                'thumbnail': video_thumb
class UstreamIE(InfoExtractor):
    """Information extractor for www.ustream.tv recorded videos.

    NOTE(review): elided view — the `info = {` opener and `return`
    lines are missing at the end.
    """
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        # Direct CDN url can be derived from the video id alone.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        m = re.search(r'data-title="(?P<title>.+)"',webpage)
        title = m.group('title')
        m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
        uploader = m.group('uploader')
        # NOTE(review): `info = {` opener elided before this entry.
            'uploader': uploader
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows.

    Reads the `gon.show` JSON blob embedded in the page and appends a
    constant bitrate query to the Akamai stream URL.

    NOTE(review): elided view — `if m is None:` guard, the `try:`
    opener and the final `return [{` lines are missing below.
    """
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)
        m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        raise ExtractorError(u'Cannot find metadata')
        json_data = m.group(1)
        # NOTE(review): `try:` opener elided before the next line.
        data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        video_url = data['akamai_url'] + '&cbr=256'
        url_parts = compat_urllib_parse_urlparse(video_url)
        video_ext = url_parts.path.rpartition('.')[2]
        # NOTE(review): `info = {` opener elided before these entries.
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': data.get('host', {}).get('name'),
            'uploader_id': data.get('host', {}).get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com.

    Collects every download link on the watch page, derives a format
    label from the URL path, and honours --format / --list-formats.

    NOTE(review): elided view — guards, the loop over `links`, the
    per-format `formats.append({` openers and several `return` lines
    are missing below.
    """
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        # NOTE(review): loop header over `formats` and return lines elided.
        if(x["format"]==req_format):

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        req = compat_urllib_request.Request(url)
        # Site gates content behind an age check; pre-set the cookie.
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        self._downloader.to_stderr(u'WARNING: unable to extract video date')
        upload_date = result.group('date').strip()

        # Get the video uploader
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        self._downloader.to_stderr(u'WARNING: unable to extract uploader')
        video_uploader = None
        video_uploader = result.group('uploader').strip()
        video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))

        # NOTE(review): `formats = []` and `for link in links:` elided here.
            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            format = path.split('/')[4].split('_')[:2]
            # NOTE(review): `size, bitrate = format` style unpacking elided.
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            # NOTE(review): `formats.append({` opener elided before these entries.
                'uploader': video_uploader,
                'upload_date': upload_date,
                'description': None,

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)

        req_format = self._downloader.params.get('format', None)
        self._downloader.to_screen(u'[youporn] Format: %s' % req_format)

        # Formats are assumed sorted best-first on the page.
        if req_format is None or req_format == 'best':
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
        # Otherwise look up the specific requested format.
        format = self._specific( req_format, formats )
        self._downloader.trouble(u'ERROR: requested format not available')
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com.

    NOTE(review): elided view — `if ... is None:` guards and the final
    info-dict entries / `return` are missing below.
    """
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = compat_urllib_parse.unquote(result.group('url'))

        #Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        # NOTE(review): error text says "title" but this step extracts the date.
        self._downloader.trouble(u'ERROR: unable to extract video title')
        upload_date = result.group('date')

        info = {'id': video_id,
                'upload_date': upload_date,
                'title': video_title,
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com.

    Resolves the embed page referenced by the watch page, then pulls
    the flv source URL out of the embedded player setup.

    NOTE(review): elided view — `if result is None:` guards and the
    closing info-dict lines are missing below.
    """
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video title
        result = re.search(r'<title>(?P<title>.*)</title>', webpage)
        raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = result.group('title').strip()

        # Get the embed page
        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = result.group(0).strip()
        video_id = result.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # Get the video URL
        result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        raise ExtractorError(u'ERROR: unable to extract video url')
        video_url = result.group('source')

        info = {'id': video_id,
                'title': video_title,
                'player_url': embed_page_url}
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes.

    Parses the `PAGE.mix` JSON on the page, then walks the play/next
    API until `at_last_track`, emitting one entry per track.

    NOTE(review): elided view — `if mobj is None:` guard, the
    `mix_id` assignment, the per-track `entries.append({` opener and
    loop `break`/`return` lines are missing below.
    """
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        raise ExtractorError(u'Cannot find trax information')
        json_like = m.group(1)
        data = json.loads(json_like)

        # Random session id used by the 8tracks streaming API.
        session = str(random.randint(0, 1000000000))
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url

        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            # NOTE(review): `info = {` opener elided before these entries.
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],

            if api_data['set']['at_last_track']:
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
class KeekIE(InfoExtractor):
    """Information extractor for keek.com.

    CDN video and thumbnail URLs are derived directly from the video id;
    title and uploader are scraped from the page.

    NOTE(review): elided view — `IE_NAME`, the `info = {` opener and
    `return` lines are missing below.
    """
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)
        m = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
        title = unescapeHTML(m.group('title'))
        m = re.search(r'<div class="bio-names-and-report">[\s\n]+<h4>(?P<uploader>\w+)</h4>', webpage)
        uploader = unescapeHTML(m.group('uploader'))
        # NOTE(review): `info = {` opener elided before these entries.
            'thumbnail': thumbnail,
            'uploader': uploader
class TEDIE(InfoExtractor):
    """Information extractor for www.ted.com talks and playlists.

    A `talks` URL is resolved directly via _talk_info; a `playlists`
    URL is expanded into one _talk_info result per listed talk.

    NOTE(review): elided view — parts of the verbose _VALID_URL
    (alternation and closers), the `video_RE=r'''` opener, and several
    list initializers / `return` lines are missing below.
    """
    _VALID_URL=r'''http://www.ted.com/
        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
        ((?P<type_talk>talks)) # We have a simple talk
        /(?P<name>\w+) # Here goes the name and then ".html"
        '''

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Verbose regex, so re.VERBOSE is mandatory.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        # Otherwise it is a playlist URL.
        playlist_id=m.group('playlist_id')
        name=m.group('name')
        self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
        return self._playlist_videos_info(url,name,playlist_id)

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # NOTE(review): `video_RE=r'''` opener elided before this pattern body.
        <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
        ([.\s]*?)data-playlist_item_id="(\d+)"
        ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)
        # NOTE(review): `info=[]` initializer elided before this loop.
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            info.append(self._talk_info(talk_url,video_id))

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<h1><span id="altHeadline" >(?P<title>.*)</span></h1>'
        title=re.search(title_RE, webpage).group('title')
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                    "id":(?P<videoID>[\d]+).*?
                    "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        # NOTE(review): `info = {` opener elided before this entry.
            'thumbnail': thumb_match.group('thumbnail')
4055 class MySpassIE(InfoExtractor):
# Extractor for myspass.de. All metadata (download URL, title, format,
# description, thumbnail) comes from a site XML API keyed by the numeric
# video id taken from the page URL's path.
4056 _VALID_URL = r'http://www.myspass.de/.*'
4058 def _real_extract(self, url):
4059 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
4061 # video id is the last path element of the URL
4062 # usually there is a trailing slash, so also try the second but last
4063 url_path = compat_urllib_parse_urlparse(url).path
4064 url_parent_path, video_id = os.path.split(url_path)
# fallback: a trailing slash makes the last element empty, so take the parent
4066 _, video_id = os.path.split(url_parent_path)
# fetch and parse the XML metadata document for this id
4069 metadata_url = META_DATA_URL_TEMPLATE % video_id
4070 metadata_text = self._download_webpage(metadata_url, video_id)
4071 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
4073 # extract values from metadata
4074 url_flv_el = metadata.find('url_flv')
4075 if url_flv_el is None:
# NOTE(review): trouble() only reports the error; the line(s) after it
# presumably return early — confirm, otherwise `.text` on the missing
# element below would raise AttributeError. Same applies to the title
# and format_id checks further down.
4076 self._downloader.trouble(u'ERROR: unable to extract download url')
4078 video_url = url_flv_el.text
# derive the container extension from the media URL's file suffix
4079 extension = os.path.splitext(video_url)[1][1:]
4080 title_el = metadata.find('title')
4081 if title_el is None:
4082 self._downloader.trouble(u'ERROR: unable to extract title')
4084 title = title_el.text
4085 format_id_el = metadata.find('format_id')
4086 if format_id_el is None:
4089 format = format_id_el.text
# optional fields: description and thumbnail. NOTE(review): if these
# elements are absent, the names look unbound here — presumably elided
# lines set defaults; verify against the full file.
4090 description_el = metadata.find('description')
4091 if description_el is not None:
4092 description = description_el.text
4095 imagePreview_el = metadata.find('imagePreview')
4096 if imagePreview_el is not None:
4097 thumbnail = imagePreview_el.text
# (remaining fields of the returned info dictionary)
4106 'thumbnail': thumbnail,
4107 'description': description
4111 def gen_extractors():
4112 """ Return a list of an instance of every supported extractor.
4113 The order does matter; the first extractor matched is the one handling the URL.
4116 YoutubePlaylistIE(),
4140 StanfordOpenClassroomIE(),