2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # NOTE(review): this listing is an elided sample of the original module;
    # several interior lines (try: openers, returns, simple assignments, and
    # some `def` lines) are missing. Comments document only the visible code.

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # NOTE(review): at least one statement of the original body appears
        # to be elided here — confirm against the full source.
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a class attribute each concrete extractor defines.
        return re.match(self._VALID_URL, url) is not None

    # NOTE(review): the `def` lines for the two methods below (the _WORKING
    # getter and initialize()) are missing from this listing; only their
    # docstrings/bodies remain.
    """Getter method for _WORKING."""

    """Initializes an instance (authentication, etc)."""
    self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # Delegates to the subclass hook; initialization call is elided here.
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""

    # NOTE(review): the `def` line for the IE_NAME property is missing from
    # this listing. Strips the trailing "IE" from the class name.
    return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Returns the response handle."""
        # NOTE(review): the `if note is None:` guard and `try:` opener are
        # elided in this listing.
        note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # Wrap any network failure in ExtractorError, preserving the
            # original traceback.
            errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Returns the data of the page as a string."""
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        webpage_bytes = urlh.read()
        # Decode with replacement so malformed UTF-8 never raises.
        return webpage_bytes.decode('utf-8', 'replace')
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # NOTE(review): this listing is an elided sample — the opening of the
    # _VALID_URL raw-string literal, several try: openers, returns, breaks,
    # and dict bodies are missing. Comments document only the visible code.

    # Verbose-mode URL pattern; the assignment line is elided above this.
        (?:https?://)?                                       # http(s):// (optional)
        (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
        tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
        (?:.*?\#/)? # handle anchor (#/) redirect urls
        (?!view_play_list|my_playlists|artist|playlist) # ignore playlist URLs
        (?: # the various things that can precede the ID:
            (?:(?:v|embed|e)/) # v/ or embed/ or e/
            |(?: # or the v= param in all its forms
                (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                (?:\?|\#!?) # the params delimiter ? or # or #!
                (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
            )? # optional -> youtube.com/xxxx is OK
        )? # all until now is optional -> you can pass the naked ID
        ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
        (?(1).+)? # if we found the ID, everything can follow
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container-extension map; most entries are elided in this listing.
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # itag -> display-dimension map; entries are elided in this listing.
    _video_dimensions = {

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overrides the base class to compile _VALID_URL in VERBOSE mode,
        # since the pattern above uses whitespace and inline comments.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube timedtext XML into SubRip (.srt) text.

        NOTE(review): the `srt` accumulator initialization, the float
        conversion of `start`, and the final return appear elided here.
        """
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # default caption duration (seconds) when dur attribute is absent
            end = start + float(dur)
            # Render timestamps as HH:MM:SS,mmm (SubRip format).
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'

    def _extract_subtitles(self, video_id):
        """Fetch closed captions for *video_id*.

        Returns a (warning_message, srt_contents) pair: exactly one element
        is non-None.
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        # NOTE(review): try: opener elided before the urlopen below.
        srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        # Map language code -> track name from the track list XML.
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        # Language priority: user-requested, then English, then first listed.
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            srt_lang = list(srt_lang_list.keys())[0]
        if not srt_lang in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        params = compat_urllib_parse.urlencode({
            'name': srt_lang_list[srt_lang].encode('utf-8'),
        url = 'http://www.youtube.com/api/timedtext?' + params
        srt_xml = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        return (u'WARNING: Did not fetch video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))

    def _print_formats(self, formats):
        """Print the itag, extension and dimensions of each available format."""
        print('Available formats:')
        # NOTE(review): the `for x in formats:` loop header is elided here.
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set language, log in (username/password or .netrc) and confirm age."""
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # NOTE(review): try: opener elided; the netrc lookup below can
            # raise, handled by the except clause that follows.
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # netrc problems are non-fatal: warn and continue anonymously.
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))

        # Force English so date/metadata scraping sees a stable layout.
        request = compat_urllib_request.Request(self._LANG_URL)
        compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))

        # No authentication to be performed

        request = compat_urllib_request.Request(self._LOGIN_URL)
        login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to fetch login page: %s' % compat_str(err))

        # Scrape the anti-forgery tokens the Google login form requires.
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)

        # NOTE(review): the opening of the login_form_strs dict literal and
        # most of its entries are elided in this listing.
                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'PersistentCookie': u'yes',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'signIn': u'Sign in',
                u'service': u'youtube',

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        # If the login form is still present, authentication failed.
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))

        # NOTE(review): the opening of the age_form dict literal is elided.
            'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        self.report_age_confirmation()
        age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # Age confirmation failure is fatal for age-gated videos.
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))

    def _extract_id(self, url):
        """Return the YouTube video id embedded in *url*, or report an error."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # NOTE(review): the `if mobj is None:` guard and final return are
        # elided in this listing.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(2)

    def _real_extract(self, url):
        """Download the watch page and get_video_info, then build the result dicts."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        # Un-escape the JSON-escaped URL (\\/ -> /).
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Try several `el` variants until one yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
            video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id (best effort — only warns on failure)
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        video_uploader_id = mobj.group(1)
        self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: normalize separators, then try several date layouts.
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
        for expression in format_expressions:
            upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
            video_description = ''

        # closed captions (only when requested)
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
            self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
        video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            # Parse the comma-separated stream map into itag -> signed URL.
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific formats. We pick the first in a slash-delimited sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

        # Build one info dict per selected format.
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                    self._video_dimensions.get(format_param, '???'))

            # NOTE(review): the results.append({ opener is elided before
            # these dict entries.
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # NOTE(review): elided listing — try: openers, `if mobj is None:` guards
    # and several returns are missing. Comments document the visible code only.

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page, then POST the family-filter form."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        self.report_disclaimer()
        disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))

        # NOTE(review): the disclaimer_form dict opener is elided before this
        # entry; the form disables the family filter.
            'submit': "Continue - I'm over 18",
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        self.report_age_confirmation()
        disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        """Extract the media URL, title and uploader for a Metacafe video."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate yt-prefixed ids to the YouTube extractor via the downloader.
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))
        video_extension = mediaURL[-3:]

        # Extract gdaKey if available
        mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
        gdaKey = mobj.group(1)
        video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)

        # Fallback path: parse the flashvars blob for mediaData.
        mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        vardict = compat_parse_qs(mobj.group(1))
        if 'mediaData' not in vardict:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = mobj.group(1).replace('\\/', '/')
        video_extension = mediaURL[-3:]
        video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # NOTE(review): the return [{ opener is elided before these entries.
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # NOTE(review): elided listing — guards, else: branches and the final
    # return are missing. Comments document only the visible code.

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the best available stream URL plus metadata from a Dailymotion page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Strip title suffix and query string from the path component.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter so age-gated videos are served.
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Probe qualities from best to worst; first hit wins.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            self._downloader.to_screen(u'[dailymotion] Using %s' % key)
            self._downloader.trouble(u'ERROR: unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        self._downloader.trouble(u'ERROR: unable to extract video URL')

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        # looking for official user
        mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
        if mobj_official is None:
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
            video_uploader = mobj_official.group(1)
            video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        # Reassemble DD-MM-YYYY as YYYYMMDD.
        video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        # NOTE(review): the return [{ opener and id/url entries are elided
        # before these dict entries.
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # NOTE(review): elided listing — guards, try: openers, the
    # `video_url = mediaURL` assignment and the return opener are missing.

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the .flv media URL, title and uploader from a Photobucket page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        # Title and uploader come from the same <title> pattern (groups 1 and 2).
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        # NOTE(review): the return [{ opener is elided before these entries.
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # NOTE(review): elided listing — guards, try: openers and the return
    # opener are missing. Comments document only the visible code.

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract metadata and the playlist media URL for a Yahoo! video.

        *new_video* is False on the single recursive call made after a
        non-/watch/ URL has been rewritten to its canonical form.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            # Recurse exactly once on the canonical URL.
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video uploader')
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
        yv_bitrate = '700' # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        self._downloader.trouble(u'ERROR: Unable to extract media URL')
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        # NOTE(review): the return [{ opener and the 'url' entry are elided
        # before these dict entries.
            'id':       video_id.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com.

    NOTE(review): this excerpt is lossy — `if mobj is None:` guards,
    `try:`/`return` lines and some `else:` branches are missing between the
    surviving statements; inline comments flag the obvious gaps.
    """

    # _VALID_URL matches Vimeo URLs (named groups: proto, direct_link, id)
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        # elided guard: the trouble() call below runs only when mobj is None
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url  # default to https when no scheme was given
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id  # normalize player-redirect links

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        # elided: try:
        self.report_download_webpage(video_id)
        webpage_bytes = compat_urllib_request.urlopen(request).read()
        webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the watch page
        config = webpage.split(' = {config:')[1].split(',assets:')[0]
        config = json.loads(config)
        # elided except-branch: reached when the config JSON cannot be located/parsed
        self._downloader.trouble(u'ERROR: unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description from the itemprop="description" element
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date (YYYYMMDD built from the dateCreated meta tag)
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                # elided: else: — fall back to the first listed quality
                files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the best available quality bucket, in order of preference
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
        # elided else-branch of the loop above: no known codec matched
        self._downloader.trouble(u'ERROR: no known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
            %(video_id, sig, timestamp, video_quality, video_codec.upper())

        # Result dictionary consumed by FileDownloader
        # (elided: the surrounding `return [{ ... }]` and the 'id'/'url' entries)
        'uploader': video_uploader,
        'uploader_id': video_uploader_id,
        'upload_date': video_upload_date,
        'title': video_title,
        'ext': video_extension,
        'thumbnail': video_thumbnail,
        'description': video_description,
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor.

    NOTE(review): lossy excerpt — `try:`/`return` lines, guard conditions,
    and some literal arguments of the grep_webpage() calls are missing;
    inline comments flag the obvious gaps.
    """

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live streams are recognized by an index-<n>.html page name
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download *url* and return its body (the `return` line is elided in this excerpt)."""
        request = compat_urllib_request.Request(url)
        # elided: try:
        self.report_download_webpage(url)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex* with *regexFlags*, and collect the groups
        described by *matchTuples* (iterable of (group_index, key, error_message))
        into an info dict (its initialisation and `return` are elided here)."""
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        # elided guard: runs only when mobj is None
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
            # elided: else:
            info[key] = mobj.group(i)

    def extractLiveStream(self, url):
        """Resolve a live-stream page; language code is taken from the URL path."""
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            # elided: url argument and flags before/after the regex
            r'src="(.*?/videothek_js.*?\.js)',
            (1, 'url', u'ERROR: Invalid URL: %s' % url)
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            # elided: url argument; regex spans three concatenated parts
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
            '(http://.*?\.swf).*?' +
            (1, 'path', u'ERROR: could not extract video path: %s' % url),
            (2, 'player', u'ERROR: could not extract video player: %s' % url),
            (3, 'url', u'ERROR: could not extract video url: %s' % url)
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve an arte+7 (catch-up) page into the final video info entries."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            # elided: url argument/flags
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            (1, 'url', u'ERROR: Invalid URL: %s' % url)
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            # elided: url argument/flags; regex spans four concatenated parts
            r'<video id="(.*?)".*?>.*?' +
            '<name>(.*?)</name>.*?' +
            '<dateVideo>(.*?)</dateVideo>.*?' +
            '<url quality="hd">(.*?)</url>',
            (1, 'id', u'ERROR: could not extract video id: %s' % url),
            (2, 'title', u'ERROR: could not extract video title: %s' % url),
            (3, 'date', u'ERROR: could not extract video date: %s' % url),
            (4, 'url', u'ERROR: could not extract video url: %s' % url)

        # Result dictionary entries (elided: the surrounding `return { ... }`)
        'id': info.get('id'),
        'url': compat_urllib_parse.unquote(info.get('url')),
        'uploader': u'arte.tv',
        'upload_date': info.get('date'),
        'title': info.get('title').decode('utf-8'),

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        # Live pages and catch-up (+7) pages are resolved differently
        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
        # elided: return / else:
        info = self.extractPlus7Stream(url)
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    NOTE(review): lossy excerpt — `if mobj is None:` guards, `try:`/`return`
    lines and some method bodies are missing; inline comments flag the gaps.
    """

    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            # Request subclass forcing the HEAD verb (method body elided: `return "HEAD"`)
            def get_method(self):

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # drop body-related headers before re-issuing as HEAD
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    # elided: remaining HeadRequest keyword arguments / closing paren
                    return HeadRequest(newurl,
                                       origin_req_host=req.get_origin_req_host(),
                # elided: else:
                raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # elided: fp.read()/fp.close() cleanup before retrying
                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                # elided: remaining Request keyword arguments / closing paren
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                        origin_req_host=req.get_origin_req_host(),

        # Build a dedicated opener that issues HEAD requests and follows redirects
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()
        # elided guard: only follow when new_url actually differs from url
        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        # elided: return True/False

    def _real_extract(self, url):
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        # elided: try:
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
        # since this is the last-resort InfoExtractor, if
        # this error is thrown, it'll be thrown here
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        # elided guard: broaden only when the first pattern failed
        # Broaden the search a little bit
        mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        # elided guard: runs only when mobj is None
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        # elided guard: runs only when mobj is None
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        # elided guard: runs only when mobj is None
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_uploader = mobj.group(1)

        # Result dictionary entries (elided: surrounding `return [{ ... }]` and 'id'/'url' keys)
        'uploader': video_uploader,
        'upload_date': None,
        'title': video_title,
        'ext': video_extension,
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    NOTE(review): lossy excerpt — the `if prefix == '':`, `try:`/`return`
    and loop-initialisation lines are missing; inline comments flag the gaps.
    """

    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        # elided guard: runs only when mobj is None
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # elided: if prefix == '':  — bare "ytsearch:" downloads one result
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
        # elided: else: try: n = int(prefix) / if n <= 0:
        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_youtube_results:
            self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
            n = self._max_youtube_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # elided: video_ids/pagenum/limit initialisation
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            # elided: try:
            data = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # never request past the total number of available results
            limit = min(n, api_response['totalItems'])
            # elided: pagenum increment

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    NOTE(review): lossy excerpt — guard/`try:`/`return` and loop-initialisation
    lines are missing; inline comments flag the gaps.
    """

    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    # presence of the "next page" pagination link
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        # elided guard: runs only when mobj is None
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # elided: if prefix == '':  — bare "gvsearch:" downloads one result
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
        # elided: else: try: n = int(prefix) / if n <= 0:
        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_google_results:
            self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
            n = self._max_google_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # elided: video_ids/pagenum initialisation and `while True:` loop header
        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
        request = compat_urllib_request.Request(result_url)
        # elided: try:
        page = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in video_ids:
                video_ids.append(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                    # elided: return

        # no "next page" link: everything collected so far is the result set
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            for id in video_ids:
                self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
            # elided: return

        pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    NOTE(review): lossy excerpt — guard/`try:`/`return` and loop-initialisation
    lines are missing; inline comments flag the gaps.
    """

    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    # presence of the "Next" pagination link
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        # elided guard: runs only when mobj is None
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # elided: if prefix == '':  — bare "yvsearch:" downloads one result
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
        # elided: else: try: n = int(prefix) / if n <= 0:
        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_yahoo_results:
            self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
            n = self._max_yahoo_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # elided: video_ids/pagenum initialisation and `while True:` loop header
        already_seen = set()

        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
        request = compat_urllib_request.Request(result_url)
        # elided: try:
        page = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in already_seen:
                video_ids.append(video_id)
                already_seen.add(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                    # elided: return

        # no "Next" link: everything collected so far is the result set
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            for id in video_ids:
                self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
            # elided: return

        pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists.

    NOTE(review): lossy excerpt — guard/`try:`/`return`/`else:` lines and the
    paging-loop initialisation are missing; inline comments flag the gaps.
    """

    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s'
    # presence of the "Next »" pagination link
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        # elided guard: runs only when mobj is None
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        # Single video case: group 3 carries an explicit video id
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])
            # elided: return

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        # elided: else:
        playlist_prefix = 'p'
        playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)

        # elided: video_ids = [] / pagenum = 1 initialisation and `while True:` header
        self.report_download_page(playlist_id, pagenum)
        url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
        request = compat_urllib_request.Request(url)
        # elided: try:
        page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        # elided: ids_in_page = []
        for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        video_ids.extend(ids_in_page)

        # stop paging when the "Next" link disappears (elided: break)
        if self._MORE_PAGES_INDICATOR not in page:
        pagenum = pagenum + 1

        total = len(video_ids)

        # Honour --playlist-start / --playlist-end
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        # elided: else:
        video_ids = video_ids[playliststart:playlistend]

        if len(video_ids) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        # elided: else:
        self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        # elided: return
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels.

    NOTE(review): lossy excerpt — guard/`try:` lines and the paging-loop
    initialisation are missing; inline comments flag the gaps.
    """

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    # presence of the "Next »" pagination link
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        # elided guard: runs only when mobj is None
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        # Download channel pages
        channel_id = mobj.group(1)
        # elided: video_ids = [] / pagenum = 1 initialisation and `while True:` header
        self.report_download_page(channel_id, pagenum)
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        request = compat_urllib_request.Request(url)
        # elided: try:
        page = compat_urllib_request.urlopen(request).read().decode('utf8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        # elided: ids_in_page = []
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        video_ids.extend(ids_in_page)

        # stop paging when the "Next" link disappears (elided: break)
        if self._MORE_PAGES_INDICATOR not in page:
        pagenum = pagenum + 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        # elided: return
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users.

    NOTE(review): lossy excerpt — guard/`try:`/`break` lines and the
    paging-loop initialisation are missing; inline comments flag the gaps.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # GData caps each response at 50 entries
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        # Extract username (elided guard follows the match)
        mobj = re.match(self._VALID_URL, url)
        # elided guard: runs only when mobj is None
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.
        # elided: video_ids = [] / pagenum = 0 and `while True:` header

        start_index = pagenum * self._GDATA_PAGE_SIZE + 1
        self.report_download_page(username, start_index)

        request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
        # elided: try:
        page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        # elided: ids_in_page = []
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        # again. (elided: break / pagenum increment)
        if len(ids_in_page) < self._GDATA_PAGE_SIZE:

        all_ids_count = len(video_ids)
        # Honour --playlist-start / --playlist-end
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        # elided: else:
        video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    NOTE(review): lossy excerpt — guard/`try:`/`break` lines, the `_PAGE_SIZE`
    class attribute and the paging-loop initialisation are missing; inline
    comments flag the gaps.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        # Extract username (elided guard follows the match)
        mobj = re.match(self._VALID_URL, url)
        # elided guard: runs only when mobj is None
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        username = mobj.group(1)

        # mobile episode-list endpoint; users_id is filled in below
        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)
        # elided: try:
        page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        page_base = page_base % mobj.group(1)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.
        # elided: video_ids = [] / pagenum = 1 and `while True:` header

        self.report_download_page(username, pagenum)

        request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )
        # elided: try:
        page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        # elided: ids_in_page = []
        for mobj in re.finditer(r'href="/([^"]+)"', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(unescapeHTML(mobj.group(1)))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        # again. (elided: break / pagenum increment)
        if len(ids_in_page) < self._PAGE_SIZE:

        all_ids_count = len(video_ids)
        # Honour --playlist-start / --playlist-end
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        # elided: else:
        video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        # NOTE(review): this chunk is elided — `try:` / `return` lines of the
        # original are not visible here.
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        self.report_download_webpage(file_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse whitespace in the site's own restriction notice.
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        # (elided: `if mobj is None:` guard before this error report)
        self._downloader.trouble(u'ERROR: unable to extract title')
        # NOTE(review): webpage was read as bytes and never decoded; these
        # .decode('utf-8') calls only work on Python 2 str — verify under Py3.
        file_title = mobj.group(1).decode('utf-8')

        'id': file_id.decode('utf-8'),
        'url': file_url.decode('utf-8'),
        'upload_date': None,
        'title': file_title,
        'ext': file_extension.decode('utf-8'),
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    # Group 'ID' captures the numeric video id from /video/video.php or /photo.php.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)

    def _real_initialize(self):
        # Optional login step; runs once before extraction.
        # NOTE(review): this chunk is elided — guards, the login_form
        # construction and `return` lines of the original are not visible.
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # .netrc problems are non-fatal: warn and continue anonymously.
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))

        if useremail is None:

        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        login_results = compat_urllib_request.urlopen(request).read()
        # Presence of the login <form> in the response means we are NOT logged in.
        if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
            self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # (elided: `if mobj is None:` guard before this error report)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The JSON array of flash variables sits between these two literal
        # javascript fragments in the page source.
        BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        # 'params' is a URL-encoded JSON object holding the stream data.
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_url = params['hd_src']
        video_duration = int(params['video_duration'])

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        'title': video_title,
        'duration': video_duration,
        'thumbnail': params['thumbnail_src'],
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Extracts the trailing filename extension from a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        # NOTE(review): this chunk is elided — guards, `try:`, the cchar
        # computation and the `return`/info-dict wrappers are not visible.
        mobj = re.match(self._VALID_URL, url)
        # (elided: `if mobj is None:` guard before this error report)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Ask blip.tv for a JSON description of the page instead of HTML.
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves different data depending on the client; mimic iTunes.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        urlh = compat_urllib_request.urlopen(request)
        if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
            basename = url.split('/')[-1]
            title,ext = os.path.splitext(basename)
            # NOTE(review): .decode('UTF-8') on the split result only works on
            # Python 2 str — verify under Python 3.
            title = title.decode('UTF-8')
            ext = ext.replace('.', '')
            self.report_direct_download(title)
            'upload_date': None,
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            json_code_bytes = urlh.read()
            json_code = json_code_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))

            json_data = json.loads(json_code)
            # Some responses wrap the payload in a 'Post' object.
            if 'Post' in json_data:
                data = json_data['Post']

            # blip.tv timestamps look like '08-15-12 04:35PM'; normalise to YYYYMMDD.
            upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
            video_url = data['media']['url']
            umobj = re.match(self._URL_EXT, video_url)
            # (elided: `if umobj is None:` guard before this raise)
            raise ValueError('Can not determine filename extension')
            ext = umobj.group(1)

            'id': data['item_id'],
            'uploader': data['display_name'],
            'upload_date': upload_date,
            'title': data['title'],
            'format': data['media']['mimeType'],
            'thumbnail': data['thumbnailUrl'],
            'description': data['description'],
            'player_url': data['embedUrl'],
            'user_agent': 'iTunes/10.6.1',
        except (ValueError,KeyError) as err:
            self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    # group(1) is the numeric video id; group(2) the title slug.
    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the flv URL and title from a myvideo.de watch page.

        Returns a single-entry list of info dictionaries, or None after
        reporting trouble (matching the error style of the sibling IEs).
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUGFIX: was `self._download.trouble(...)` — an AttributeError
            # that masked the real "invalid URL" report.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link encodes the movie directory; the flv sits next to it.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
                webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':          video_id,
            'url':         video_url,
            'uploader':    None,
            'upload_date': None,
            'title':       video_title,
            'ext':         u'flv',
        }]
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                     |(https?://)?(www\.)?
                     (?P<showname>thedailyshow|colbertnation)\.com/
                     (full-episodes/(?P<episode>.*)|
                     (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                     |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
    # Bitrates the site offers, lowest to highest.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # NOTE(review): the bodies of these two mappings are elided in this chunk.
    _video_extensions = {
    _video_dimensions = {

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is a verbose regex (re.VERBOSE).
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        # (elided: the `for x in formats:` loop header)
        print('Available formats:')
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        # NOTE(review): this chunk is elided — guards, `try:`, `return` and
        # several assignment lines of the original are not visible here.
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # (elided: `if mobj is None:` guard before this error report)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Resolve the :tds / :colbert style shortcuts to a full-episodes URL.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
                epTitle = mobj.group('cntitle')

        dlNewest = not mobj.group('episode')
        epTitle = mobj.group('showname')
        epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        htmlHandle = compat_urllib_request.urlopen(req)
        html = htmlHandle.read()
        webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # The index page may redirect to the newest episode; re-parse the
        # final URL so group('episode') reflects where we actually landed.
        url = htmlHandle.geturl()
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
        if mobj.group('episode') == '':
            self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
        epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.
            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
            mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))

        # One <item> per episode part; each is downloaded separately below.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                    compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)

            self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            format, rtmp_video_url = f, v

            # Rewrite the rtmp URL onto the http mirror host.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)

            'upload_date': officialDate,
            'description': officialTitle,

            results.append(info)
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        # NOTE(review): this chunk is elided — guards, `try:` and `return`
        # lines of the original are not visible here.
        mobj = re.match(self._VALID_URL, url)
        # (elided: `if mobj is None:` guard before this error report)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        webPage = compat_urllib_request.urlopen(url)
        webPageBytes = webPage.read()
        # Honour the charset declared in the Content-Type header, default utf-8.
        m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
        webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))

        # Scrape metadata out of the OpenGraph/meta tags.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        # The player URL carries a url-encoded config location in its query string.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        configJSON = compat_urllib_request.urlopen(configUrl)
        m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
        configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))

        playlist = config['playlist']
        # playlist[1] holds the actual episode entry.
        videoUrl = playlist[1]['url']

        'uploader': showName,
        'upload_date': None,
        'thumbnail': imgUrl,
        'description': description,
        'player_url': playerUrl,
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        # NOTE(review): this chunk is elided — guards, `try:`, the `info`
        # dict literal and `return` lines of the original are not visible.
        mobj = re.match(self._VALID_URL, url)
        # (elided: `if mobj is None:` guard before this error report)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        'upload_date': None,

        self.report_extraction(video_id)
        # First fetch: metadata XML describing the video and its manifest URL.
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        videoNode = mdoc.findall('./video')[0]
        info['description'] = videoNode.findall('./description')[0].text
        info['title'] = videoNode.findall('./caption')[0].text
        info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
        manifest_url = videoNode.findall('./file')[0].text
        self._downloader.trouble(u'\nERROR: Invalid metadata XML file')

        # Second fetch: the Adobe HDS (f4m) manifest for the actual media.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
        node_id = media_node.attrib['url']
        video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')

        # Build the direct segment URL from the manifest location and ids.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        # NOTE(review): this chunk is elided — guards and the `return`/info
        # dict wrapper of the original are not visible here.
        mobj = re.match(self._VALID_URL, url)
        # (elided: `if mobj is None:` guard before this error report)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # The flash variables embed a url-encoded flv location.
        mobj = re.search(r'flv_url=(.+?)&', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = compat_urllib_parse.unquote(mobj.group(1))

        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1)

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        # group(0): the whole matched URL is the thumbnail address.
        video_thumbnail = mobj.group(0)

        'upload_date': None,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': None,
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
    """

    # group(1): uploader slug, group(2): track slug.
    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        # NOTE(review): this chunk is elided — guards, `try:` and the
        # `return`/info dict wrapper of the original are not visible here.
        mobj = re.match(self._VALID_URL, url)
        # (elided: `if mobj is None:` guard before this error report)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # Step 1: resolve the page URL to the track's numeric API id.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        info_json_bytes = compat_urllib_request.urlopen(request).read()
        info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # Step 2: ask the CDN for the stream URLs of that track id.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        stream_json_bytes = compat_urllib_request.urlopen(request).read()
        stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        'uploader': info['user']['username'],
        'upload_date': info['created_at'],
        'title': info['title'],
        'description': info['description'],
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        # NOTE(review): this chunk is elided — guards and the `return`/info
        # dict wrapper of the original are not visible here.
        mobj = re.match(self._VALID_URL, url)
        # (elided: `if mobj is None:` guard before this error report)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The page embeds a base64-encoded rtmpe path in jsclassref.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        self._downloader.trouble(u'ERROR: unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1)

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        # Derive id and extension from the media file name in the rtmpe path.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        'upload_date': None,
        'title': video_title,
        'ext': extension, # Extension is always(?) mp4, but seems to be flv
        'description': video_description,
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        # NOTE(review): `try:`/`return` lines are elided in this chunk.
        bitrate_list = jsonData[fmt]
        if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
            bitrate = max(bitrate_list) # select highest

        url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        # Probes each candidate URL; the first that opens without error wins.
        for url in url_list:
            compat_urllib_request.urlopen(url)
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:

    def _print_formats(self, formats):
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                ext = formats[fmt][b][0]
                print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))

    def _real_extract(self, url):
        # NOTE(review): `.decode('utf-8')` on re match groups / str below only
        # works on Python 2 — consistent with _WORKING = False above.
        mobj = re.match(self._VALID_URL, url)
        # (elided: `if mobj is None:` guard before this error report)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        self.report_download_json(file_url)
        jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))

        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)

        # 'best' (or no preference): take the first format that has a live URL.
        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                file_url = self.check_urls(url_list)
                if file_url is not None:
            if req_format not in formats:
                self._downloader.trouble(u'ERROR: format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        'id': file_id.decode('utf-8'),
        'url': file_url.decode('utf-8'),
        'uploader': uploader.decode('utf-8'),
        'upload_date': None,
        'title': json_data['name'],
        'ext': file_url.split('.')[-1].decode('utf-8'),
        'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
        'thumbnail': json_data['thumbnail_url'],
        'description': json_data['description'],
        'player_url': player_url.decode('utf-8'),
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    # Three URL shapes: site root, a CoursePage (course only), or a
    # VideoPage (course + video) — dispatched on the named groups below.
    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        # NOTE(review): this chunk is elided — guards, `try:`, the `info`
        # dict literals and `return` lines of the original are not visible.
        mobj = re.match(self._VALID_URL, url)
        # (elided: `if mobj is None:` guard before this raise)
        raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            'id': course + '_' + video,
            'upload_date': None,

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            info['title'] = mdoc.findall('./title')[0].text
            info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]

        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            'upload_date': None,

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            info['title'] = unescapeHTML(m.group(1))
            # Fallback when the page has no usable <h1>.
            info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            info['description'] = unescapeHTML(m.group(1))

            # Each VideoPage link becomes a reference entry, extracted recursively.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            'type': 'reference',
            'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),

            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])

            'id': 'Stanford OpenClassroom',
            'upload_date': None,

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            # Site root: recurse into every CoursePage link found on the homepage.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            'type': 'reference',
            'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),

            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
3013 class MTVIE(InfoExtractor):
3014 """Information extractor for MTV.com"""
3016 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3019 def report_extraction(self, video_id):
3020 """Report information extraction."""
3021 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
    def _real_extract(self, url):
        """Extract song name, performer and the mediaGen stream URL for an
        MTV.com video page."""
        # NOTE(review): reproduced from a partial dump; jumps in the embedded
        # original line numbers mark omitted lines (mostly "if mobj is None:"
        # guards and "try:" openers) — verify against upstream.
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')
        webpage = self._download_webpage(url, video_id)
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract song name')
        # Page is declared iso-8859-1; decode before unescaping entities.
        song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract performer')
        performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        video_title = performer + ' - ' + song_name
        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        self._downloader.trouble(u'ERROR: unable to mtvn_uri')
        mtvn_uri = mobj.group(1)
        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        self._downloader.trouble(u'ERROR: unable to extract content id')
        content_id = mobj.group(1)
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')
        # For now, always pick the highest quality.
        rendition = renditions[-1]
        # Derive extension from the MIME type, e.g. "video/mp4" -> "mp4".
        _,_,ext = rendition.attrib['type'].partition('/')
        format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
        video_url = rendition.find('./src').text
        self._downloader.trouble('Invalid rendition field.')
        # [gap: the "info = {" opener for the keys below is omitted]
        'uploader': performer,
        'upload_date': None,
        'title': video_title,
3094 class YoukuIE(InfoExtractor):
3095 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3097 def report_download_webpage(self, file_id):
3098 """Report webpage download."""
3099 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))
3101 def report_extraction(self, file_id):
3102 """Report information extraction."""
3103 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
        # NOTE(review): reproduced from a partial dump. The "def _gen_sid(self):"
        # line (original ~3105) is missing; the four statements below are its
        # body: build a session id from a millisecond timestamp plus two
        # random components.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)
        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_id_mix_string_DOC(self): pass  # placeholder removed below
    def _get_file_ID_mix_string(self, seed):
        """Deterministically shuffle a fixed alphabet using *seed* (a linear
        congruential scheme); the result is used to decode file ids."""
        # [gap: original 3115 — the "mixed = []" initialiser is omitted]
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        for i in range(len(source)):
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        # [gap: the actual return statement (original ~3122) is omitted]

    def _get_file_id(self, fileId, seed):
        """Translate the '*'-separated digit string *fileId* into the real id
        by indexing into the seed-shuffled alphabet."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        # [gap: originals 3127-3129 — "realId = []" and the loop over ids]
            realId.append(mixed[int(ch)])
        return ''.join(realId)
    def _real_extract(self, url):
        """Fetch Youku playlist JSON, decode the segment file ids, and emit
        one info dict per flv segment of the video."""
        # NOTE(review): reproduced from a partial dump; jumps in the embedded
        # original line numbers mark omitted lines (guards, try openers,
        # format-selection branches) — verify against upstream.
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('ID')
        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
        request = compat_urllib_request.Request(info_url, None, std_headers)
        self.report_download_webpage(video_id)
        jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
        self.report_extraction(video_id)
        jsonstr = jsondata.decode('utf-8')
        config = json.loads(jsonstr)
        video_title = config['data'][0]['title']
        seed = config['data'][0]['seed']
        format = self._downloader.params.get('format', None)
        supported_format = list(config['data'][0]['streamfileids'].keys())
        # Format selection: [gap] the bodies choosing 'hd2'/'flv'/etc. between
        # these branch headers are omitted from the dump.
        if format is None or format == 'best':
            if 'hd2' in supported_format:
        elif format == 'worst':
        fileid = config['data'][0]['streamfileids'][format]
        keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)
        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):
            # Splice the segment index (hex) into the decoded file id.
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
            # [gap: "info = {" opener omitted]
            'id': '%s_part%02d' % (video_id, index),
            'url': download_url,
            'upload_date': None,
            'title': video_title,
            files_info.append(info)
3205 class XNXXIE(InfoExtractor):
3206 """Information extractor for xnxx.com"""
3208 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
3210 VIDEO_URL_RE = r'flv_url=(.*?)&'
3211 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3212 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3214 def report_webpage(self, video_id):
3215 """Report information extraction"""
3216 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3218 def report_extraction(self, video_id):
3219 """Report information extraction"""
3220 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
    def _real_extract(self, url):
        """Scrape flv URL, title and thumbnail from an xnxx.com video page
        using the class-level regexes (VIDEO_URL_RE etc.)."""
        # NOTE(review): reproduced from a partial dump; jumps in the embedded
        # original line numbers mark omitted guard lines — verify upstream.
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1)
        self.report_webpage(video_id)
        # Get webpage content
        webpage_bytes = compat_urllib_request.urlopen(url).read()
        webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
        result = re.search(self.VIDEO_URL_RE, webpage)
        self._downloader.trouble(u'ERROR: unable to extract video url')
        # The flv URL arrives percent-encoded in the page source.
        video_url = compat_urllib_parse.unquote(result.group(1))
        result = re.search(self.VIDEO_TITLE_RE, webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = result.group(1)
        result = re.search(self.VIDEO_THUMB_RE, webpage)
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = result.group(1)
        # [gap: the "return [{" info-dict opener is omitted]
        'upload_date': None,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': None,
3269 class GooglePlusIE(InfoExtractor):
3270 """Information extractor for plus.google.com."""
3272 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3273 IE_NAME = u'plus.google'
    def __init__(self, downloader=None):
        """Constructor. Delegates entirely to the InfoExtractor base class."""
        InfoExtractor.__init__(self, downloader)
3278 def report_extract_entry(self, url):
3279 """Report downloading extry"""
3280 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3282 def report_date(self, upload_date):
3283 """Report downloading extry"""
3284 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3286 def report_uploader(self, uploader):
3287 """Report downloading extry"""
3288 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3290 def report_title(self, video_title):
3291 """Report downloading extry"""
3292 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3294 def report_extract_vid_page(self, video_page):
3295 """Report information extraction."""
3296 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
    def _real_extract(self, url):
        """Two-step Google+ extraction: scrape the post page for metadata,
        then follow the photo/video page and pick the highest-resolution
        googlevideo redirector link."""
        # NOTE(review): reproduced from a partial dump; jumps in the embedded
        # original line numbers mark omitted lines (guards, try openers,
        # fallback branches) — verify against upstream.
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
        post_url = mobj.group(0)
        video_id = mobj.group(1)
        video_extension = 'flv'
        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
        # Extract update date
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        upload_date = mobj.group(1)
        # Convert timestring to a format suitable for filename
        upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
        upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        uploader = mobj.group(1)
        self.report_uploader(uploader)
        # Get the first line for title
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        video_title = mobj.group(1)
        self.report_title(video_title)
        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        self._downloader.trouble(u'ERROR: unable to extract video page URL')
        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
        self.report_extract_vid_page(video_page)
        # Extract video links on video page
        """Extract video links of all sizes"""
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        self._downloader.trouble(u'ERROR: unable to extract video links')
        # Sort in resolution
        links = sorted(mobj)
        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')
        # [gap: the "return [{" info-dict opener is omitted]
        'uploader': uploader,
        'upload_date': upload_date,
        'title': video_title,
        'ext': video_extension,
3393 class NBAIE(InfoExtractor):
3394 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    def _real_extract(self, url):
        """Build the CDN mp4 URL straight from the NBA.com path and scrape
        page metadata via the local _findProp helper."""
        # NOTE(review): reproduced from a partial dump; jumps in the embedded
        # original line numbers mark omitted guard/return lines.
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]
        webpage = self._download_webpage(url, video_id)
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
        def _findProp(rexp, default=None):
            # Return the unescaped first group of rexp in the page, or default.
            m = re.search(rexp, webpage)
            return unescapeHTML(m.group(1))
        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        # [gap: "info = {" opener omitted]
        'id': shortened_video_id,
        # NOTE(review): 'uploader_date' looks like a typo for 'upload_date'
        # (the optional-fields list on InfoExtractor names 'upload_date') —
        # confirm and fix upstream.
        'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
        'description': _findProp(r'<div class="description">(.*?)</h1>'),
3429 class JustinTVIE(InfoExtractor):
3430 """Information extractor for justin.tv and twitch.tv"""
3431 # TODO: One broadcast may be split into multiple videos. The key
3432 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3433 # starts at 1 and increases. Can we treat all parts as one video?
3435 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3436 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3437 _JUSTIN_PAGE_LIMIT = 100
3438 IE_NAME = u'justin.tv'
3440 def report_extraction(self, file_id):
3441 """Report information extraction."""
3442 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3444 def report_download_page(self, channel, offset):
3445 """Report attempt to download a single page of videos."""
3446 self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
3447 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        """Download one Justin.tv API page and convert each clip entry into
        an info dict; returns (raw item count, list of info dicts)."""
        # NOTE(review): reproduced from a partial dump; jumps in the embedded
        # original line numbers mark omitted lines ("try:", skip-on-missing-url
        # guard, the "info = {"/append lines).
        urlh = compat_urllib_request.urlopen(url)
        webpage_bytes = urlh.read()
        webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
        response = json.loads(webpage)
        # A non-list response is the API's error envelope.
        if type(response) != list:
            error_text = response.get('error', 'unknown error')
            self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
        for clip in response:
            video_url = clip['video_file_url']
            video_extension = os.path.splitext(video_url)[1][1:]
            # start_time is ISO-ish; strip dashes from the date part -> YYYYMMDD.
            video_date = re.sub('-', '', clip['start_time'][:10])
            video_uploader_id = clip.get('user_id', clip.get('channel_id'))
            video_id = clip['id']
            video_title = clip.get('title', video_id)
            # [gap: info-dict opener omitted]
            'title': video_title,
            'uploader': clip.get('channel_name', video_uploader_id),
            'uploader_id': video_uploader_id,
            'upload_date': video_date,
            'ext': video_extension,
        return (len(response), info)
    def _real_extract(self, url):
        """Page through the Justin.tv API (channel archives or a single
        broadcast) accumulating clip info dicts."""
        # NOTE(review): reproduced from a partial dump; the paging loop
        # header, 'paged'/'offset' initialisation and final return are among
        # the omitted lines — verify against upstream.
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        api = 'http://api.justin.tv'
        # Group 1 = channel name, group 2 = broadcast id (when present).
        video_id = mobj.group(mobj.lastindex)
        if mobj.lastindex == 1:
            api += '/channel/archives/%s.json'
        api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)
        self.report_extraction(video_id)
        limit = self._JUSTIN_PAGE_LIMIT
        self.report_download_page(video_id, offset)
        page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
        page_count, page_info = self._parse_page(page_url)
        info.extend(page_info)
        # A short page means we have reached the end of the archive.
        if not paged or page_count != limit:
3516 class FunnyOrDieIE(InfoExtractor):
3517 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
    def _real_extract(self, url):
        """Scrape the <video>/<source> URL, title and og:description from a
        funnyordie.com video page."""
        # NOTE(review): reproduced from a partial dump; jumps in the embedded
        # original line numbers mark omitted guard/return lines.
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)
        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        self._downloader.trouble(u'ERROR: unable to find video information')
        video_url = unescapeHTML(m.group('url'))
        m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
        self._downloader.trouble(u'Cannot find video title')
        title = unescapeHTML(m.group('title'))
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(m.group('desc'))
        # [gap: "info = {" opener and remaining keys omitted]
        'description': desc,
3553 class TweetReelIE(InfoExtractor):
3554 _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'
    def _real_extract(self, url):
        """Scrape tweet text, uploader, timestamp and status id from a
        tweetreel.com page, then derive the .mov download URL from the id."""
        # NOTE(review): reproduced from a partial dump; jumps in the embedded
        # original line numbers mark omitted guard/return lines.
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)
        m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
        self._downloader.trouble(u'ERROR: Cannot find status ID')
        status_id = m.group(1)
        m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
        self._downloader.trouble(u'WARNING: Cannot find description')
        # Strip embedded <a> tags before unescaping the tweet text.
        desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()
        m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
        self._downloader.trouble(u'ERROR: Cannot find uploader')
        uploader = unescapeHTML(m.group('uploader'))
        uploader_id = unescapeHTML(m.group('uploader_id'))
        m = re.search(r'<span unixtime="([0-9]+)"', webpage)
        self._downloader.trouble(u'ERROR: Cannot find upload date')
        upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')
        video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'
        # [gap: "info = {" opener and leading keys omitted]
        'description': desc,
        'uploader': uploader,
        'uploader_id': uploader_id,
        'internal_id': status_id,
        'upload_date': upload_date
3602 class SteamIE(InfoExtractor):
3603 _VALID_URL = r"""http://store.steampowered.com/
3604 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3606 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3609 def suitable(self, url):
3610 """Receives a URL and returns True if suitable for this IE."""
3611 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
    def _real_extract(self, url):
        """Collect every movie entry from a Steam store video page, pairing
        movie URLs with the <span class="title"> names in page order."""
        # NOTE(review): reproduced from a partial dump; jumps in the embedded
        # original line numbers mark omitted lines (result-list setup, the
        # per-video info dict, the final return).
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        gameID = m.group('gameID')
        videourl = 'http://store.steampowered.com/video/%s/' % gameID
        webpage = self._download_webpage(videourl, gameID)
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        # Movies and titles are assumed to appear in the same order.
        for vid,vtitle in zip(mweb,titles):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
            # [gap: info-dict opener omitted]
            'title': unescapeHTML(title)
3638 class UstreamIE(InfoExtractor):
3639 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3640 IE_NAME = u'ustream'
    def _real_extract(self, url):
        """Derive the CDN URL from the recorded-video id and scrape title and
        uploader (channel id) from the page."""
        # NOTE(review): reproduced from a partial dump; the info-dict opener
        # and return (originals 3651-3655 area) are omitted.
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        m = re.search(r'data-title="(?P<title>.+)"',webpage)
        title = m.group('title')
        m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
        uploader = m.group('uploader')
        'uploader': uploader
3660 class RBMARadioIE(InfoExtractor):
3661 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
    def _real_extract(self, url):
        """Parse the inline `gon.show` JSON from an RBMA Radio show page and
        build the stream URL from its akamai_url field."""
        # NOTE(review): reproduced from a partial dump; the "if m is None:"
        # guard, "try:" opener and info-dict opener are omitted.
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        webpage = self._download_webpage(url, video_id)
        m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        raise ExtractorError(u'Cannot find metadata')
        json_data = m.group(1)
        data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))
        # Fixed 256 kbps stream variant.
        video_url = data['akamai_url'] + '&cbr=256'
        url_parts = compat_urllib_parse_urlparse(video_url)
        video_ext = url_parts.path.rpartition('.')[2]
        # [gap: info-dict opener omitted]
        'title': data['title'],
        'description': data.get('teaser_text'),
        'location': data.get('country_of_origin'),
        'uploader': data.get('host', {}).get('name'),
        'uploader_id': data.get('host', {}).get('slug'),
        'thumbnail': data.get('image', {}).get('large_url_2x'),
        'duration': data.get('duration'),
3696 class YouPornIE(InfoExtractor):
3697 """Information extractor for youporn.com."""
3698 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3700 def _print_formats(self, formats):
3701 """Print all available formats"""
3702 print(u'Available formats:')
3703 print(u'ext\t\tformat')
3704 print(u'---------------------------------')
3705 for format in formats:
3706 print(u'%s\t\t%s' % (format['ext'], format['format']))
    def _specific(self, req_format, formats):
        # NOTE(review): partial dump — the loop over formats (original 3709)
        # and the return statements (3711-3713) are omitted. Appears to select
        # the entry whose 'format' equals req_format; verify upstream.
        if(x["format"]==req_format):
    def _real_extract(self, url):
        """Scrape a YouPorn watch page: title/date/uploader metadata plus the
        full download list, then honour --list-formats / --format selection."""
        # NOTE(review): reproduced from a partial dump; jumps in the embedded
        # original line numbers mark omitted lines (guards, the per-link loop
        # header, info-dict openers, several returns) — verify upstream.
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('videoid')
        # Age gate is bypassed with a cookie rather than a click-through.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)
        # Get the video title
        result = re.search(r'videoTitleArea">(?P<title>.*)</h1>', webpage)
        raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = result.group('title').strip()
        # Get the video date
        result = re.search(r'Date:</b>(?P<date>.*)</li>', webpage)
        self._downloader.to_stderr(u'WARNING: unable to extract video date')
        upload_date = result.group('date').strip()
        # Get the video uploader
        result = re.search(r'Submitted:</b>(?P<uploader>.*)</li>', webpage)
        self._downloader.to_stderr(u'ERROR: unable to extract uploader')
        video_uploader = None
        video_uploader = result.group('uploader').strip()
        video_uploader = clean_html( video_uploader )
        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()
        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')
        self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))
        # [gap: "formats = []" and "for link in links:" omitted]
        # A link looks like this:
        # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
        # A path looks like this:
        # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
        video_url = unescapeHTML( link )
        path = compat_urllib_parse_urlparse( video_url ).path
        extension = os.path.splitext( path )[1][1:]
        # Path segment like "480p_370k_..." -> ['480p', '370k'].
        format = path.split('/')[4].split('_')[:2]
        format = "-".join( format )
        title = u'%s-%s-%s' % (video_title, size, bitrate)
        # [gap: per-format info-dict opener omitted]
        'uploader': video_uploader,
        'upload_date': upload_date,
        'description': None,
        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
        req_format = self._downloader.params.get('format', None)
        self._downloader.to_screen(u'[youporn] Format: %s' % req_format)
        # Format selection mirrors the CLI contract: best/worst/all/specific.
        if req_format is None or req_format == 'best':
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
        format = self._specific( req_format, formats )
        self._downloader.trouble(u'ERROR: requested format not available')
3815 class PornotubeIE(InfoExtractor):
3816 """Information extractor for pornotube.com."""
3817 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
    def _real_extract(self, url):
        """Scrape the flv URL and upload date from a pornotube.com page (the
        title comes straight from the URL match)."""
        # NOTE(review): reproduced from a partial dump; jumps in the embedded
        # original line numbers mark omitted guard/return lines.
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('videoid')
        video_title = mobj.group('title')
        # Get webpage content
        webpage = self._download_webpage(url, video_id)
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = compat_urllib_parse.unquote(result.group('url'))
        #Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        # NOTE(review): this message says "video title" but the search above
        # is for the upload date — looks copy-pasted; confirm and fix upstream.
        self._downloader.trouble(u'ERROR: unable to extract video title')
        upload_date = result.group('date')
        info = {'id': video_id,
                # [gap: 'url'/'uploader' entries omitted from dump]
                'upload_date': upload_date,
                'title': video_title,
3857 class YouJizzIE(InfoExtractor):
3858 """Information extractor for youjizz.com."""
3859 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
    def _real_extract(self, url):
        """Two-step YouJizz extraction: find the embed page on the watch page,
        then read the flv URL out of the embed page's flashvars."""
        # NOTE(review): reproduced from a partial dump; jumps in the embedded
        # original line numbers mark omitted guard lines and some info keys.
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('videoid')
        # Get webpage content
        webpage = self._download_webpage(url, video_id)
        # Get the video title
        result = re.search(r'<title>(?P<title>.*)</title>', webpage)
        raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = result.group('title').strip()
        # Get the embed page
        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = result.group(0).strip()
        video_id = result.group('videoid')
        webpage = self._download_webpage(embed_page_url, video_id)
        # The stream URL sits in an addVariable("file", ...) flashvars call.
        result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        raise ExtractorError(u'ERROR: unable to extract video url')
        video_url = result.group('source')
        info = {'id': video_id,
                # [gap: 'url'/'ext' style entries omitted from dump]
                'title': video_title,
                'player_url': embed_page_url}
3903 class EightTracksIE(InfoExtractor):
3905 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
    def _real_extract(self, url):
        """Walk an 8tracks mix via its play/next JSON API, emitting one info
        dict per track until at_last_track is reported."""
        # NOTE(review): reproduced from a partial dump. 'mix_id' is used below
        # but its assignment (likely "mix_id = data['id']", around original
        # 3922) is among the omitted lines — verify against upstream.
        mobj = re.match(self._VALID_URL, url)
        raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')
        webpage = self._download_webpage(url, playlist_id)
        m = re.search(r"new TRAX.Mix\((.*?)\);\n*\s*TRAX.initSearchAutocomplete\('#search'\);", webpage, flags=re.DOTALL)
        raise ExtractorError(u'Cannot find trax information')
        json_like = m.group(1)
        data = json.loads(json_like)
        # Random session token required by the play API.
        session = str(random.randint(0, 1000000000))
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            # [gap: per-track info-dict opener omitted]
            'id': track_data['id'],
            'url': track_data['track_file_stream_url'],
            'title': track_data['performer'] + u' - ' + track_data['name'],
            'raw_title': track_data['name'],
            'uploader_id': data['user']['login'],
            if api_data['set']['at_last_track']:
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
3947 class KeekIE(InfoExtractor):
3948 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    def _real_extract(self, url):
        """Build the keek.com CDN video/thumbnail URLs from the id and scrape
        title and uploader from the page."""
        # NOTE(review): reproduced from a partial dump; the info-dict opener
        # and 'url'/'ext'/'title' entries are among the omitted lines.
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)
        m = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
        title = unescapeHTML(m.group('title'))
        m = re.search(r'<div class="bio-names-and-report">[\s\n]+<h4>(?P<uploader>\w+)</h4>', webpage)
        uploader = unescapeHTML(m.group('uploader'))
        'thumbnail': thumbnail,
        'uploader': uploader
3971 class MyspassIE(InfoExtractor):
3972 _VALID_URL = r'http://www.myspass.de/.*'
3973 IE_NAME = u'myspass'
    def _real_extract(self, url):
        """Derive the video id from the URL path, fetch myspass.de's metadata
        XML, and assemble the info dict from its elements."""
        # NOTE(review): reproduced from a partial dump; jumps in the embedded
        # original line numbers mark omitted lines (the empty-id fallback
        # check, default assignments for format/description/thumbnail, the
        # info-dict opener and return).
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        _, video_id = os.path.split(url_parent_path)
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.trouble(u'ERROR: unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
        format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        # [gap: info-dict opener and leading keys omitted]
        'thumbnail': thumbnail,
        'description': description
4028 def gen_extractors():
4029 """ Return a list of an instance of every supported extractor.
4030 The order does matter; the first extractor matched is the one handling the URL.
4033 YoutubePlaylistIE(),
4057 StanfordOpenClassroomIE(),