2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
14 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # Set to False in subclasses that are known to be broken; tests and
    # user-facing warnings key off this flag.
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # _ready tracks whether _real_initialize() has run, so that
        # initialize() is idempotent across repeated extract() calls.
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url) is not None

    def working(self):
        """Getter method for _WORKING."""
        return self._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Derive the display name from the class name, e.g. YoutubeIE -> Youtube.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        if note is None:
            note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            # Chain the original traceback so the network failure is visible.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        webpage_bytes = urlh.read()
        # 'replace' avoids crashing on pages with invalid UTF-8 sequences.
        return webpage_bytes.decode('utf-8', 'replace')
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # NOTE(review): the opening of the _VALID_URL assignment (a raw
    # triple-quoted, re.VERBOSE regex string) is not visible in this chunk,
    # and neither is its closing quote before _LANG_URL — confirm against
    # the full file. The indented lines below are the regex body.
                     (?:https?://)?                                       # http(s):// (optional)
                     (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                        tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                     (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                     (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                     (?:                                                  # the various things that can precede the ID:
                         (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                         |(?:                                             # or the v= param in all its forms
                             (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                             (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                             (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                         )?                                               # optional -> youtube.com/xxxx is OK
                     )?                                                   # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                     # here is it! the YouTube video ID
                     (?(1).+)?                                            # if we found the ID, everything can follow

    # Helper URLs used during initialization (language, login, age gate).
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Pulls the original URL back out of an age-verification redirect.
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; NOTE(review): most entries of this dict
    # and its closing brace are not visible in this chunk.
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # itag -> "WxH" display string; NOTE(review): dict body and closing
    # brace are not visible in this chunk.
    _video_dimensions = {

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # re.VERBOSE because _VALID_URL is written with embedded comments.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that a requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        # Convert YouTube's closed-caption XML into SRT-formatted text.
        # NOTE(review): the initialization of the local accumulator
        # (presumably srt = '') is not visible in this chunk.
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'
            # NOTE(review): the conversion of `start` to float appears to be
            # missing here; as shown, str + float would raise.
            end = start + float(dur)
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        # NOTE(review): trailing 'return srt' not visible in this chunk.

    def _extract_subtitles(self, video_id):
        # Returns (warning_or_None, srt_contents_or_None).
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        # NOTE(review): 'try:' not visible before the next statement.
        srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        # Build a mapping of lang_code -> track name.
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            # NOTE(review): the "srt_lang = 'en'" branch body and the trailing
            # 'else:' are not visible here; the next line looks like the
            # else-branch fallback (first available language).
            srt_lang = list(srt_lang_list.keys())[0]
        if not srt_lang in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
        # NOTE(review): 'try:' not visible before the next statement.
        srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        # NOTE(review): an 'if not srt_xml:' guard appears to be missing
        # before the next line.
        return (u'WARNING: unable to download video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))

    def _print_formats(self, formats):
        # Pretty-print itag / extension / dimensions for --list-formats.
        print('Available formats:')
        # NOTE(review): the 'for x in formats:' loop header is not visible here.
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        # Set language, optionally log in, and confirm age.
        if self._downloader is None:
            # NOTE(review): early 'return' body of this guard not visible here.

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # NOTE(review): 'try:' and the username/password unpacking from
            # `info` are not visible around the next lines.
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
        except (IOError, netrc.NetrcParseError) as err:
            self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        # NOTE(review): 'try:' and self.report_lang() not visible before the
        # next statement.
        compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))

        # No authentication to be performed
        # NOTE(review): the '(username is None)' early return and the opening
        # of the 'login_form' dict are not visible before these entries.
            'current_form': 'loginForm',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        # NOTE(review): 'try:' and self.report_login() not visible here.
        login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
            # The login form being present again means authentication failed.
            self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))

        # Confirm age
        # NOTE(review): the opening of the 'age_form' dict is not visible
        # before this entry.
            'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        # NOTE(review): 'try:' not visible before the next statements.
        self.report_age_confirmation()
        age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))

    def _extract_id(self, url):
        # Pull the video ID (capture group 2) out of any recognized URL form.
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # NOTE(review): the 'if mobj is None:' guard and the trailing
        # 'return video_id' are not visible in this chunk.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(2)

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        # NOTE(review): 'if mobj is not None:' guard not visible here.
        url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        # NOTE(review): 'try:' not visible before the next statement.
        video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            # NOTE(review): early 'return' after trouble() not visible here.

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        # NOTE(review): the guard and a 'player_url = None' fallback are not
        # visible around the next line (which un-escapes the \/-style URL).
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Get video info: try several 'el' variants until one yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            # NOTE(review): 'try:' not visible before the next statement.
            video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
            video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                # NOTE(review): 'break' body of this guard not visible here.
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))

        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            # NOTE(review): 'else:' not visible before the next line.
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        # NOTE(review): 'if mobj is not None:' / 'else:' structure not
        # visible around the next two lines.
        video_uploader_id = mobj.group(1)
        self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            # NOTE(review): a fallback assignment (e.g. '') is not visible here.
        else: # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        # NOTE(review): 'upload_date = None' and the 'if mobj is not None:'
        # guard are not visible before the next lines.
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
        for expression in format_expressions:
            # NOTE(review): the 'try:'/'except' around strptime (to tolerate
            # non-matching expressions) is not visible here.
            upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        # NOTE(review): 'else:' not visible before the next line.
            video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
            # NOTE(review): an 'if srt_error:' guard is not visible here.
            self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            # NOTE(review): a fallback assignment / 'else:' is not visible here.
        video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            # NOTE(review): 'else:' not visible before the next line.
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                # NOTE(review): early 'return' not visible here.
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                # NOTE(review): early 'return' not visible here.
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            # NOTE(review): 'else:' not visible before the next comment.
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    # NOTE(review): 'if rf in url_map:' guard and a 'break'
                    # are not visible around the next line.
                    video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
        # NOTE(review): the final 'else:' header (no conn / no stream map) is
        # not visible before the next line.
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

        # Build one result dict per selected format.
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            # NOTE(review): the 'results.append({' opening and the 'id' entry
            # of this result dict are not visible before these entries.
                'url': video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension,
                'format': video_format,
                'thumbnail': video_thumbnail,
                'description': video_description,
                'player_url': player_url,
                'subtitles': video_subtitles,
                'duration': video_duration
        # NOTE(review): the closing '})' and 'return results' are not visible
        # in this chunk.
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    # Family-filter disclaimer pages fetched once during initialization.
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        # NOTE(review): 'try:' not visible before the next statements.
        self.report_disclaimer()
        disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
            # NOTE(review): early 'return' not visible here.

        # Confirm age. NOTE(review): the opening of the 'disclaimer_form'
        # dict (and any 'filters' field) is not visible before this entry.
            'submit': "Continue - I'm over 18",
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        # NOTE(review): 'try:' not visible before the next statements.
        self.report_age_confirmation()
        disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): 'if mobj is None:' guard not visible before this line.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate YouTube-hosted Metacafe entries to the YouTube extractor.
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            # NOTE(review): 'return' after delegation not visible here.

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        # NOTE(review): 'try:' not visible before the next statements.
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        # NOTE(review): 'if mobj is not None:' guard not visible here.
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))
        video_extension = mediaURL[-3:]

        # Extract gdaKey if available
        mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
        # NOTE(review): the guard/else structure around the gdaKey branch is
        # not visible here.
        gdaKey = mobj.group(1)
        video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)

        # Fallback path: parse the flashvars blob when &mediaURL= is absent.
        mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
        # NOTE(review): 'if mobj is None:' guard not visible before this line.
        self._downloader.trouble(u'ERROR: unable to extract media URL')

        vardict = compat_parse_qs(mobj.group(1))
        if 'mediaData' not in vardict:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            # NOTE(review): early 'return' not visible here.
        mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
        # NOTE(review): 'if mobj is None:' guard not visible before this line.
        self._downloader.trouble(u'ERROR: unable to extract media URL')

        # Un-escape JSON-style \/-escaped slashes.
        mediaURL = mobj.group(1).replace('\\/', '/')
        video_extension = mediaURL[-3:]
        video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        # NOTE(review): 'if mobj is None:' guard not visible before this line.
        self._downloader.trouble(u'ERROR: unable to extract title')

        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        # NOTE(review): 'if mobj is None:' guard not visible before this line.
        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')

        video_uploader = mobj.group(1)

        # NOTE(review): the 'return [{' opening of the result list is not
        # visible before these entries.
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): 'if mobj is None:' guard not visible before this line.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Strip trailing title/query parts off the path component.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter so age-gated videos are served.
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        # NOTE(review): 'if mobj is None:' guard not visible before this line.
        self._downloader.trouble(u'ERROR: unable to extract media URL')

        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Try qualities from best to worst.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            # NOTE(review): the 'if key in flashvars:' guard, the
            # 'max_quality = key' assignment, the 'break', and the trailing
            # 'else:' are not visible around these lines.
            self._downloader.to_screen(u'[dailymotion] Using %s' % key)
            self._downloader.trouble(u'ERROR: unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        # NOTE(review): 'if mobj is None:' guard not visible before this line.
        self._downloader.trouble(u'ERROR: unable to extract video URL')

        # Un-escape JSON-style \/-escaped slashes.
        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        # NOTE(review): 'if mobj is None:' guard not visible before this line.
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        # NOTE(review): the 'if mobj is None:' branch header is not visible
        # before this fallback path.
        # lookin for official user
        mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
        if mobj_official is None:
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
        # NOTE(review): 'else:' lines not visible around the two assignments
        # below (official-user vs. owner-span match).
        video_uploader = mobj_official.group(1)
        video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        # NOTE(review): 'if mobj is not None:' guard not visible here.
        # Reassemble DD-MM-YYYY into YYYYMMDD.
        video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        # NOTE(review): the 'return [{' opening and the 'id'/'url' entries of
        # the result dict are not visible before these entries.
            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): 'if mobj is None:' guard not visible before this line.
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # NOTE(review): 'try:' not visible before the next statements.
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        # NOTE(review): 'if mobj is None:' guard not visible before this line.
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        # NOTE(review): the 'video_url = mediaURL' assignment appears to be
        # missing between here and the title extraction — confirm against
        # the full file ('video_url' is referenced in the result below).

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        # NOTE(review): 'if mobj is None:' guard not visible before this line.
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        # Group 2 of the same <title> match holds the uploader name.
        video_uploader = mobj.group(2).decode('utf-8')

        # NOTE(review): the 'return [{' opening of the result list is not
        # visible before these entries.
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): 'if mobj is None:' guard not visible before this line.
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            # NOTE(review): 'try:' not visible before the next statement.
            webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            # NOTE(review): 'if mobj is None:' guard not visible here.
            self._downloader.trouble(u'ERROR: Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            # NOTE(review): 'if mobj is None:' guard not visible here.
            self._downloader.trouble(u'ERROR: Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            # Recurse exactly once with the canonical /watch/ URL.
            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # NOTE(review): 'try:' not visible before the next statements.
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        # NOTE(review): 'if mobj is None:' guard not visible here.
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        # NOTE(review): 'if mobj is None:' guard not visible here.
        self._downloader.trouble(u'ERROR: unable to extract video uploader')
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        # NOTE(review): 'if mobj is None:' guard not visible here.
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        # NOTE(review): 'if mobj is None:' guard not visible here.
        self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        # NOTE(review): 'if mobj is None:' guard not visible here.
        self._downloader.trouble(u'ERROR: unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        # NOTE(review): 'if mobj is None:' guard not visible here.
        self._downloader.trouble(u'ERROR: unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
        yv_bitrate = '700' # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        # NOTE(review): 'try:' not visible before the next statements.
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        # NOTE(review): 'if mobj is None:' guard not visible here.
        self._downloader.trouble(u'ERROR: Unable to extract media URL')
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        # NOTE(review): the 'return [{' opening and the 'url' entry of the
        # result dict are not visible before these entries.
            'id': video_id.decode('utf-8'),
            'uploader': video_uploader,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs.
    # FIX: the dot after (?:www|player) was unescaped and therefore matched
    # any character; escape it so only a literal subdomain separator matches.
    _VALID_URL = r'(?:https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page.
        # FIX: narrowed the bare except to the two errors the parse can raise
        # (IndexError from the splits, ValueError from json.loads) so real
        # bugs are not silently swallowed.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # Extract upload date (YYYYMMDD, per the InfoExtractor contract)
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = {'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':           video_id,
            'url':          video_url,
            'uploader':     video_uploader,
            'uploader_id':  video_uploader_id,
            'upload_date':  video_upload_date,
            'title':        video_title,
            'ext':          video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live streams are recognized by an index-NN.html suffix on the video id.
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download *url* and return its raw body, reporting failures via the downloader."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, run *regex* over it and collect the groups named in
        *matchTuples* (a list of (group_index, key, error_message) triples)
        into a dict. Reports trouble and returns None on any miss."""
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Resolve the RTMP path/player for a live stream page."""
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the chain of intermediate XML documents for a "+7" catch-up
        video and return its info dictionary."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title').decode('utf-8'),
            # NOTE(review): extension reconstructed from the hd <url> entry — confirm
            'ext':          u'mp4',
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    # Last resort: claims any URL. NOTE(review): reconstructed — confirm upstream.
    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers: the redirected HEAD has no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      headers=newheaders,
                                                                      origin_req_host=req.get_origin_req_host(),
                                                                      unverifiable=True))

        # Build our opener from scratch so only the handlers above apply.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True

    def _real_extract(self, url):
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            # FIX: this failure previously reported 'unable to extract title'
            # (copy-paste from the title branch); it is about the uploader.
            self._downloader.trouble(u'ERROR: unable to extract uploader')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':           video_id,
            'url':          video_url,
            'uploader':     video_uploader,
            'upload_date':  None,
            'title':        video_title,
            'ext':          video_extension,
        }]
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the literal 'ytsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        # limit is refined once the API reports totalItems
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the literal 'gvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the literal 'yvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Single video case: hand the video URL straight back to the chain.
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])
            return

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        else:
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(playlist_id, pagenum)
            url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        total = len(video_ids)

        # Honour --playlist-start / --playlist-end
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        if len(video_ids) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Download channel pages
        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(channel_id, pagenum)
            url = self._TEMPLATE_URL % (channel_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []
            for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    # Ajax endpoint returns at most this many episodes per page.
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            mobj = re.search(r'data-users-id="([^"]+)"', page)
            page_base = page_base % mobj.group(1)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)

            request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':           file_id.decode('utf-8'),
            'url':          file_url.decode('utf-8'),
            'uploader':     None,
            'upload_date':  None,
            'title':        file_title,
            'ext':          file_extension.decode('utf-8'),
        }]
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    # Order matters: first entry is considered best quality.
    _available_formats = ['video', 'highqual', 'lowqual']
    _video_extensions = {
        'video': 'mp4',
        'highqual': 'mp4',
        'lowqual': 'mp4',
    }
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page.

        video_webpage must be a str (decoded page source).
        Returns a dict with any of 'title', 'description', 'owner',
        'thumbnail' that could be found, plus 'video_urls' (fmt -> url).
        """
        # General data
        data = {'title': r'\("video_title", "(.*?)"\)',
                'description': r'<div class="datawrap">(.*?)</div>',
                'owner': r'\("video_owner_name", "(.*?)"\)',
                'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
                }
        video_info = {}
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                # The page embeds values as JS \uXXXX escapes; round-trip
                # through bytes to undo them (str has no unicode_escape
                # decode on Python 3).
                video_info[piece] = compat_urllib_parse.unquote_plus(
                    mobj.group(1).encode('utf-8').decode('unicode_escape'))

        # Video urls
        video_urls = {}
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = compat_urllib_parse.unquote_plus(
                    mobj.group(1).encode('utf-8').decode('unicode_escape'))
        video_info['video_urls'] = video_urls

        return video_info

    def _real_initialize(self):
        """Log in to Facebook using --username/--password or .netrc data."""
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In',
        }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            # Decode to str so the str-pattern search below works on Python 3.
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        try:
            page = compat_urllib_request.urlopen(request)
            # Decode immediately: _parse_page applies str regexes.
            video_webpage = page.read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = video_info['owner']

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        # Already str on Python 3; the old .decode('utf-8') was a py2 remnant.
        video_title = video_info['title']

        # thumbnail image
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:
            video_thumbnail = video_info['thumbnail']

        # upload date
        # NOTE(review): _parse_page never sets 'upload_date', so this branch
        # looks dead — kept for compatibility; confirm before removing.
        upload_date = None
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                try:
                    upload_date = time.strftime('%Y%m%d', timetuple[0:9])
                except Exception:
                    pass

        # description
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)
        format_limit = self._downloader.params.get('format_limit', None)

        if format_limit is not None and format_limit in self._available_formats:
            format_list = self._available_formats[self._available_formats.index(format_limit):]
        else:
            format_list = self._available_formats
        existing_formats = [x for x in format_list if x in url_map]
        if len(existing_formats) == 0:
            self._downloader.trouble(u'ERROR: no known formats available for video')
            return
        if req_format is None:
            video_url_list = [(existing_formats[0], url_map[existing_formats[0]])]  # Best quality
        elif req_format == 'worst':
            video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])]  # worst quality
        elif req_format == '-1':
            video_url_list = [(f, url_map[f]) for f in existing_formats]  # All formats
        else:
            if req_format not in url_map:
                self._downloader.trouble(u'ERROR: requested format not available')
                return
            video_url_list = [(req_format, url_map[req_format])]  # Specific format

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'mp4')

            results.append({
                'id': video_id,
                'url': video_real_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension,
                'format': (format_param is None and u'NA' or format_param),
                'thumbnail': video_thumbnail,
                'description': video_description,
            })
        return results
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Append the JSON-skin query with the right separator.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'):  # Direct download
                basename = url.split('/')[-1]
                title, ext = os.path.splitext(basename)
                # title is already str; the old .decode('UTF-8') was a
                # Python 2 remnant and crashes on Python 3.
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh,
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None:  # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # The payload is either wrapped in a 'Post' key or bare.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError, KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        return [info]
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Fixed: was `self._download.trouble` (AttributeError typo).
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
                         webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': u'flv',
        }]
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with re.VERBOSE, so the base-class check
        # cannot be reused as-is.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' % (x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if mobj.group('shortname'):
            # Abbreviated URL: rewrite to the show's full-episodes page.
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # No explicit episode -> download the newest (follow redirect).
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum, itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                         compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                # Skip this part instead of falling through with no URLs.
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format, rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f, v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # The RTMP stream is not downloadable directly; rewrite it to the
            # equivalent progressive-HTTP origin URL.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum + 1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Honor the declared charset; fall back to utf-8.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        # Guard every search: a page-layout change would otherwise raise
        # AttributeError on .group() of None.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        if descMatch is None or imgMatch is None or playerUrlMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract video metadata from webpage')
            return
        description = unescapeHTML(descMatch.group(1))
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        if configUrlMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract config URL from player URL')
            return
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        return [{
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }]
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')
            return

        # Rebuild the fragment URL from the manifest location.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Extract video URL
        mobj = re.search(r'flv_url=(.+?)&', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1))

        # Extract title
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        # group(0) is the full thumbnail URL (group(1) is just the filename).
        video_thumbnail = mobj.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract slug of song title (also in the url)
        slug_title = mobj.group(2)

        self.report_resolve('%s/%s' % (uploader, slug_title))

        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json_bytes = compat_urllib_request.urlopen(request).read()
            stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        # The InfoExtractor contract wants upload_date as YYYYMMDD; the raw
        # 'created_at' string ("2012/04/01 10:10:10 +0000"-style) violated it.
        # NOTE(review): created_at format assumed from the API — confirm.
        try:
            upload_date = datetime.datetime.strptime(info['created_at'], '%Y/%m/%d %H:%M:%S %z').strftime('%Y%m%d')
        except (ValueError, KeyError):
            upload_date = None

        return [{
            'id': info['id'],
            'url': mediaURL,
            'uploader': info['user']['username'],
            'upload_date': upload_date,
            'title': info['title'],
            'ext': u'mp3',
            'description': info['description'],
        }]
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
    IE_NAME = u'infoq'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL (the page stores it base64-encoded).
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension,  # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list)  # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError:  # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                url = None
        return None

    def _print_formats(self, formats):
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError:  # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        # (groups are already str on Python 3 — the old .decode('utf-8')
        # calls were py2 remnants and raise AttributeError.)
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON (decode the bytes explicitly for older Python 3)
        json_data = json.loads(jsonData.decode('utf-8'))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break  # got it!
        else:
            if req_format not in formats:
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (format_param is None and u'NA' or format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
# Extractor for openclassroom.stanford.edu. Handles three URL shapes:
# a single video page, a course page (expands to its videos), and the
# site root (expands to all courses).
2969 class StanfordOpenClassroomIE(InfoExtractor):
2970 """Information extractor for Stanford's Open ClassRoom"""
# NOTE(review): unescaped dots in 'openclassroom.stanford.edu' match any
# character — harmless in practice but technically too loose.
2972 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2973 IE_NAME = u'stanfordoc'
2975 def report_download_webpage(self, objid):
2976 """Report information extraction."""
2977 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2979 def report_extraction(self, video_id):
2980 """Report information extraction."""
2981 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2983 def _real_extract(self, url):
2984 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the `if mobj is None:` guard and several statements are
# elided throughout this listing.
2986 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Case 1: a specific video — fetch its metadata XML next to the media files.
2989 if mobj.group('course') and mobj.group('video'): # A specific video
2990 course = mobj.group('course')
2991 video = mobj.group('video')
2993 'id': course + '_' + video,
2995 'upload_date': None,
2998 self.report_extraction(info['id'])
2999 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3000 xmlUrl = baseUrl + video + '.xml'
3002 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
3003 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3004 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
3006 mdoc = xml.etree.ElementTree.fromstring(metaXml)
# findall(...)[0] raises IndexError when the element is absent; the
# surrounding try/except (elided) maps that to the trouble() call below.
3008 info['title'] = mdoc.findall('./title')[0].text
3009 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3011 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3013 info['ext'] = info['url'].rpartition('.')[2]
# Case 2: a course page — scrape the video links and recurse via extract().
3015 elif mobj.group('course'): # A course page
3016 course = mobj.group('course')
3021 'upload_date': None,
3024 self.report_download_webpage(info['id'])
3026 coursepage = compat_urllib_request.urlopen(url).read()
3027 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3028 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
3031 m = re.search('<h1>([^<]+)</h1>', coursepage)
3033 info['title'] = unescapeHTML(m.group(1))
# Fall back to the synthetic id when no <h1> title is present.
3035 info['title'] = info['id']
3037 m = re.search('<description>([^<]+)</description>', coursepage)
3039 info['description'] = unescapeHTML(m.group(1))
3041 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3044 'type': 'reference',
3045 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3049 for entry in info['list']:
3050 assert entry['type'] == 'reference'
# Recursive expansion: each reference is re-dispatched through extract().
3051 results += self.extract(entry['url'])
# Case 3: the site root — scrape all course pages and recurse likewise.
3056 'id': 'Stanford OpenClassroom',
3059 'upload_date': None,
3062 self.report_download_webpage(info['id'])
3063 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3065 rootpage = compat_urllib_request.urlopen(rootURL).read()
3066 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3067 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
3070 info['title'] = info['id']
3072 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3075 'type': 'reference',
3076 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3081 for entry in info['list']:
3082 assert entry['type'] == 'reference'
3083 results += self.extract(entry['url'])
# Extractor for mtv.com video pages: scrapes <meta> tags for the song name,
# performer and playlist ids, then fetches the mediaGen XML for renditions.
3086 class MTVIE(InfoExtractor):
3087 """Information extractor for MTV.com"""
3089 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3092 def report_extraction(self, video_id):
3093 """Report information extraction."""
3094 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3096 def _real_extract(self, url):
3097 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the `if mobj is None:` guard and several return statements
# are elided throughout this listing.
3099 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Normalize scheme-less URLs so _download_webpage gets an absolute URL.
3101 if not mobj.group('proto'):
3102 url = 'http://' + url
3103 video_id = mobj.group('videoid')
3105 webpage = self._download_webpage(url, video_id)
3107 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3109 self._downloader.trouble(u'ERROR: unable to extract song name')
# NOTE(review): .decode('iso-8859-1') on a regex group is a Python-2 idiom;
# under Python 3 `str` has no .decode — confirm the interpreter target.
3111 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3112 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3114 self._downloader.trouble(u'ERROR: unable to extract performer')
3116 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3117 video_title = performer + ' - ' + song_name
3119 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
# NOTE(review): message reads 'unable to mtvn_uri' — the verb 'extract' is
# missing from this error string.
3121 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3123 mtvn_uri = mobj.group(1)
3125 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3127 self._downloader.trouble(u'ERROR: unable to extract content id')
3129 content_id = mobj.group(1)
3131 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3132 self.report_extraction(video_id)
3133 request = compat_urllib_request.Request(videogen_url)
3135 metadataXml = compat_urllib_request.urlopen(request).read()
3136 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3137 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
3140 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3141 renditions = mdoc.findall('.//rendition')
3143 # For now, always pick the highest quality.
3144 rendition = renditions[-1]
# Derive extension from the MIME type (e.g. 'video/mp4' -> 'mp4').
3147 _,_,ext = rendition.attrib['type'].partition('/')
3148 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3149 video_url = rendition.find('./src').text
3151 self._downloader.trouble('Invalid rendition field.')
3157 'uploader': performer,
3158 'upload_date': None,
3159 'title': video_title,
# Extractor for v.youku.com: fetches the getPlayList JSON, de-obfuscates
# the segment file ids with a seeded shuffle, and emits one info dict per
# video segment.
3167 class YoukuIE(InfoExtractor):
3168 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3170 def report_download_webpage(self, file_id):
3171 """Report webpage download."""
3172 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))
3174 def report_extraction(self, file_id):
3175 """Report information extraction."""
3176 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
# Session id: millisecond timestamp plus two random numbers (the enclosing
# `def _gen_sid(self):` line is elided from this listing).
3179 nowTime = int(time.time() * 1000)
3180 random1 = random.randint(1000,1998)
3181 random2 = random.randint(1000,9999)
3183 return "%d%d%d" %(nowTime,random1,random2)
3185 def _get_file_ID_mix_string(self, seed):
# Deterministic seeded shuffle of the alphabet: the same seed always yields
# the same permutation, which _get_file_id uses as a lookup table.
3187 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3189 for i in range(len(source)):
3190 seed = (seed * 211 + 30031 ) % 65536
3191 index = math.floor(seed / 65536 * len(source) )
3192 mixed.append(source[int(index)])
3193 source.remove(source[int(index)])
3194 #return ''.join(mixed)
3197 def _get_file_id(self, fileId, seed):
# Decode the '*'-separated index list into characters of the real file id.
3198 mixed = self._get_file_ID_mix_string(seed)
3199 ids = fileId.split('*')
3203 realId.append(mixed[int(ch)])
3204 return ''.join(realId)
3206 def _real_extract(self, url):
3207 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the `if mobj is None:` guard and several branch bodies are
# elided throughout this listing.
3209 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3211 video_id = mobj.group('ID')
3213 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3215 request = compat_urllib_request.Request(info_url, None, std_headers)
3217 self.report_download_webpage(video_id)
3218 jsondata = compat_urllib_request.urlopen(request).read()
3219 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3220 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3223 self.report_extraction(video_id)
3225 jsonstr = jsondata.decode('utf-8')
3226 config = json.loads(jsonstr)
3228 video_title = config['data'][0]['title']
3229 seed = config['data'][0]['seed']
# Format selection: prefer hd2 for 'best' (branch bodies elided here).
3231 format = self._downloader.params.get('format', None)
3232 supported_format = list(config['data'][0]['streamfileids'].keys())
3234 if format is None or format == 'best':
3235 if 'hd2' in supported_format:
3240 elif format == 'worst':
3248 fileid = config['data'][0]['streamfileids'][format]
3249 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3250 except (UnicodeDecodeError, ValueError, KeyError):
3251 self._downloader.trouble(u'ERROR: unable to extract info section')
3255 sid = self._gen_sid()
3256 fileid = self._get_file_id(fileid, seed)
3258 #column 8,9 of fileid represent the segment number
3259 #fileid[7:9] should be changed
3260 for index, key in enumerate(keys):
# Patch the two segment-number hex digits into positions 8-9 of the id.
3262 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3263 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3266 'id': '%s_part%02d' % (video_id, index),
3267 'url': download_url,
3269 'upload_date': None,
3270 'title': video_title,
3273 files_info.append(info)
# Extractor for video.xnxx.com: scrapes the flv url, title and thumbnail
# straight out of the watch page with three regexes.
3278 class XNXXIE(InfoExtractor):
3279 """Information extractor for xnxx.com"""
3281 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
3283 VIDEO_URL_RE = r'flv_url=(.*?)&'
# NOTE(review): the '.' in 'XNXX.COM' is unescaped and matches any char —
# harmless here, but r'XNXX\.COM' would be exact.
3284 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3285 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3287 def report_webpage(self, video_id):
3288 """Report information extraction"""
3289 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3291 def report_extraction(self, video_id):
3292 """Report information extraction"""
3293 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3295 def _real_extract(self, url):
3296 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the `if mobj is None:` / `if result is None:` guards and
# return statements are elided throughout this listing.
3298 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3300 video_id = mobj.group(1)
3302 self.report_webpage(video_id)
3304 # Get webpage content
3306 webpage_bytes = compat_urllib_request.urlopen(url).read()
3307 webpage = webpage_bytes.decode('utf-8')
3308 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3309 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3312 result = re.search(self.VIDEO_URL_RE, webpage)
3314 self._downloader.trouble(u'ERROR: unable to extract video url')
# flv_url is percent-encoded in the page; unquote to the real URL.
3316 video_url = compat_urllib_parse.unquote(result.group(1))
3318 result = re.search(self.VIDEO_TITLE_RE, webpage)
3320 self._downloader.trouble(u'ERROR: unable to extract video title')
3322 video_title = result.group(1)
3324 result = re.search(self.VIDEO_THUMB_RE, webpage)
3326 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3328 video_thumbnail = result.group(1)
3334 'upload_date': None,
3335 'title': video_title,
3337 'thumbnail': video_thumbnail,
3338 'description': None,
# Extractor for plus.google.com posts containing a video: scrapes the post
# page for metadata, follows the photo/video page, and picks the highest
# resolution redirector.googlevideo.com link.
3342 class GooglePlusIE(InfoExtractor):
3343 """Information extractor for plus.google.com."""
3345 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3346 IE_NAME = u'plus.google'
3348 def __init__(self, downloader=None):
3349 InfoExtractor.__init__(self, downloader)
3351 def report_extract_entry(self, url):
3352 """Report downloading extry"""
3353 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3355 def report_date(self, upload_date):
3356 """Report downloading extry"""
3357 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3359 def report_uploader(self, uploader):
3360 """Report downloading extry"""
3361 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3363 def report_title(self, video_title):
3364 """Report downloading extry"""
3365 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3367 def report_extract_vid_page(self, video_page):
3368 """Report information extraction."""
3369 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3371 def _real_extract(self, url):
3372 # Extract id from URL
3373 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the `if mobj is None:` guard and return statements are
# elided throughout this listing.
3375 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3378 post_url = mobj.group(0)
3379 video_id = mobj.group(1)
3381 video_extension = 'flv'
3383 # Step 1, Retrieve post webpage to extract further information
3384 self.report_extract_entry(post_url)
3385 request = compat_urllib_request.Request(post_url)
3387 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3388 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3389 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3392 # Extract update date
3394 pattern = 'title="Timestamp">(.*?)</a>'
3395 mobj = re.search(pattern, webpage)
3397 upload_date = mobj.group(1)
3398 # Convert timestring to a format suitable for filename
# strptime raises ValueError if the page's date format changes — assumes
# a plain "YYYY-MM-DD" string here; TODO confirm against a live page.
3399 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3400 upload_date = upload_date.strftime('%Y%m%d')
3401 self.report_date(upload_date)
3405 pattern = r'rel\="author".*?>(.*?)</a>'
3406 mobj = re.search(pattern, webpage)
3408 uploader = mobj.group(1)
3409 self.report_uploader(uploader)
3412 # Get the first line for title
3414 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3415 mobj = re.search(pattern, webpage)
3417 video_title = mobj.group(1)
3418 self.report_title(video_title)
3420 # Step 2, Stimulate clicking the image box to launch video
3421 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3422 mobj = re.search(pattern, webpage)
3424 self._downloader.trouble(u'ERROR: unable to extract video page URL')
3426 video_page = mobj.group(1)
3427 request = compat_urllib_request.Request(video_page)
3429 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3430 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3431 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3433 self.report_extract_vid_page(video_page)
3436 # Extract video links on video page
3437 """Extract video links of all sizes"""
3438 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3439 mobj = re.findall(pattern, webpage)
3441 self._downloader.trouble(u'ERROR: unable to extract video links')
3443 # Sort in resolution
# Tuples sort by their first element: the numeric resolution string.
3444 links = sorted(mobj)
3446 # Choose the lowest of the sort, i.e. highest resolution
3447 video_url = links[-1]
3448 # Only get the url. The resolution part in the tuple has no use anymore
3449 video_url = video_url[-1]
3450 # Treat escaped \u0026 style hex
# On Python 2, str.decode handles the escapes; Python 3 str lacks .decode,
# so the AttributeError branch re-encodes and decodes instead.
3452 video_url = video_url.decode("unicode_escape")
3453 except AttributeError: # Python 3
3454 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3460 'uploader': uploader,
3461 'upload_date': upload_date,
3462 'title': video_title,
3463 'ext': video_extension,
# Extractor for nba.com videos: derives the CDN mp4 URL directly from the
# page path and scrapes title/date/description from meta tags.
3466 class NBAIE(InfoExtractor):
3467 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3470 def _real_extract(self, url):
3471 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the `if mobj is None:` guard and return statements are
# elided from this listing.
3473 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3476 video_id = mobj.group(1)
3477 if video_id.endswith('/index.html'):
3478 video_id = video_id[:-len('/index.html')]
3480 webpage = self._download_webpage(url, video_id)
3482 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
# Local helper: first regex group from the page, unescaped, or `default`.
3483 def _findProp(rexp, default=None):
3484 m = re.search(rexp, webpage)
3486 return unescapeHTML(m.group(1))
3490 shortened_video_id = video_id.rpartition('/')[2]
3491 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3493 'id': shortened_video_id,
# NOTE(review): 'uploader_date' looks like a typo for the documented
# 'upload_date' field (see the class docstring at the top of the file) —
# as written this key would be ignored by the downloader; confirm and fix.
3497 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3498 'description': _findProp(r'<div class="description">(.*?)</h1>'),
# Extractor for justin.tv / twitch.tv channel archives and single
# broadcasts, paging through the JSON API 100 clips at a time.
3502 class JustinTVIE(InfoExtractor):
3503 """Information extractor for justin.tv and twitch.tv"""
3504 # TODO: One broadcast may be split into multiple videos. The key
3505 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3506 # starts at 1 and increases. Can we treat all parts as one video?
3508 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3509 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3510 _JUSTIN_PAGE_LIMIT = 100
3511 IE_NAME = u'justin.tv'
3513 def report_extraction(self, file_id):
3514 """Report information extraction."""
3515 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3517 def report_download_page(self, channel, offset):
3518 """Report attempt to download a single page of videos."""
3519 self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
3520 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3522 # Return count of items, list of *valid* items
3523 def _parse_page(self, url):
# NOTE(review): the `try:` lines and early returns are elided throughout
# this listing.
3525 urlh = compat_urllib_request.urlopen(url)
3526 webpage_bytes = urlh.read()
3527 webpage = webpage_bytes.decode('utf-8', 'ignore')
3528 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3529 self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
# A non-list response is the API's error envelope (a dict with 'error').
3532 response = json.loads(webpage)
3533 if type(response) != list:
3534 error_text = response.get('error', 'unknown error')
3535 self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
3538 for clip in response:
3539 video_url = clip['video_file_url']
3541 video_extension = os.path.splitext(video_url)[1][1:]
# start_time is ISO-ish 'YYYY-MM-DD...'; strip dashes -> YYYYMMDD.
3542 video_date = re.sub('-', '', clip['start_time'][:10])
3543 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3547 'title': clip['title'],
3548 'uploader': clip.get('channel_name', video_uploader_id),
3549 'uploader_id': video_uploader_id,
3550 'upload_date': video_date,
3551 'ext': video_extension,
3553 return (len(response), info)
3555 def _real_extract(self, url):
3556 mobj = re.match(self._VALID_URL, url)
3558 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3561 api = 'http://api.justin.tv'
3562 video_id = mobj.group(mobj.lastindex)
# lastindex == 1 means only the channel group matched (no /b/<id> part),
# so page the whole channel archive; otherwise fetch the one broadcast.
3564 if mobj.lastindex == 1:
3566 api += '/channel/archives/%s.json'
3568 api += '/broadcast/by_archive/%s.json'
3569 api = api % (video_id,)
3571 self.report_extraction(video_id)
3575 limit = self._JUSTIN_PAGE_LIMIT
3578 self.report_download_page(video_id, offset)
3579 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3580 page_count, page_info = self._parse_page(page_url)
3581 info.extend(page_info)
# A short page means we've reached the end of the archive.
3582 if not paged or page_count != limit:
# Extractor for funnyordie.com: pulls the direct video source, title and
# description out of the page's <video> tag and og: meta tags.
3587 class FunnyOrDieIE(InfoExtractor):
3588 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3590 def _real_extract(self, url):
3591 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the `if mobj is None:` / `if not m:` guards and return
# statements are elided from this listing.
3593 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3596 video_id = mobj.group('id')
3597 webpage = self._download_webpage(url, video_id)
# The usable source is the *second* <source> inside the <video> element.
3599 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3601 self._downloader.trouble(u'ERROR: unable to find video information')
3602 video_url = unescapeHTML(m.group('url'))
3604 m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
3606 self._downloader.trouble(u'Cannot find video title')
3607 title = unescapeHTML(m.group('title'))
3609 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3611 desc = unescapeHTML(m.group('desc'))
3620 'description': desc,
# Extractor for tweetreel.com: scrapes the status id, description,
# uploader and unix timestamp from the page and builds the .mov URL.
3624 class TweetReelIE(InfoExtractor):
3625 _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'
3627 def _real_extract(self, url):
3628 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the `if mobj is None:` / `if not m:` guards and return
# statements are elided from this listing.
3630 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3633 video_id = mobj.group('id')
3634 webpage = self._download_webpage(url, video_id)
3636 m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
3638 self._downloader.trouble(u'ERROR: Cannot find status ID')
3639 status_id = m.group(1)
3641 m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
3643 self._downloader.trouble(u'WARNING: Cannot find description')
# Strip embedded <a> tags before unescaping so only plain text remains.
3644 desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()
3646 m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
3648 self._downloader.trouble(u'ERROR: Cannot find uploader')
3649 uploader = unescapeHTML(m.group('uploader'))
3650 uploader_id = unescapeHTML(m.group('uploader_id'))
3652 m = re.search(r'<span unixtime="([0-9]+)"', webpage)
3654 self._downloader.trouble(u'ERROR: Cannot find upload date')
# fromtimestamp uses the local timezone — presumably acceptable for a
# date-only YYYYMMDD string; NOTE(review): confirm UTC is not expected.
3655 upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')
3658 video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'
3665 'description': desc,
3666 'uploader': uploader,
3667 'uploader_id': uploader_id,
3668 'internal_id': status_id,
3669 'upload_date': upload_date
# Extractor for store.steampowered.com game trailer pages: finds every
# 'movie_<id>' entry in the page's JS and pairs it with the visible titles.
3673 class SteamIE(InfoExtractor):
# NOTE(review): the dots in 'store.steampowered.com' are unescaped inside
# this verbose regex and so match any character — overly loose.
3674 _VALID_URL = r"""http://store.steampowered.com/
3675 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3677 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
# suitable() is overridden because the pattern needs re.VERBOSE, which the
# base-class match (see head of file) does not pass.
3680 def suitable(self, url):
3681 """Receives a URL and returns True if suitable for this IE."""
3682 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
3684 def _real_extract(self, url):
3685 m = re.match(self._VALID_URL, url, re.VERBOSE)
3686 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
# NOTE(review): 'gameID' is not a visible group in the _VALID_URL lines
# shown here — the group line appears to be elided from this listing.
3687 gameID = m.group('gameID')
3688 videourl = 'http://store.steampowered.com/video/%s/' % gameID
3689 webpage = self._download_webpage(videourl, gameID)
3690 mweb = re.finditer(urlRE, webpage)
3691 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3692 titles = re.finditer(namesRE, webpage)
# zip pairs each movie entry with its on-page title in document order.
3694 for vid,vtitle in zip(mweb,titles):
3695 video_id = vid.group('videoID')
3696 title = vtitle.group('videoName')
3697 video_url = vid.group('videoURL')
3699 self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
3704 'title': unescapeHTML(title)
# Extractor for ustream.tv recorded videos: the media URL is derived
# directly from the numeric id; title and uploader are scraped from HTML.
3709 class UstreamIE(InfoExtractor):
3710 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3711 IE_NAME = u'ustream'
3713 def _real_extract(self, url):
3714 m = re.match(self._VALID_URL, url)
3715 video_id = m.group('videoID')
3716 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3717 webpage = self._download_webpage(url, video_id)
# NOTE(review): both re.search calls below are unguarded — m.group(...)
# raises AttributeError if the page layout changes (no None check visible).
3718 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3719 title = m.group('title')
3720 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3721 uploader = m.group('uploader')
3727 'uploader': uploader
# Extractor for rbmaradio.com shows: parses the `gon.show` JSON blob
# embedded in a <script> tag and reads the Akamai stream URL from it.
3731 class RBMARadioIE(InfoExtractor):
3732 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3734 def _real_extract(self, url):
3735 m = re.match(self._VALID_URL, url)
3736 video_id = m.group('videoID')
3738 webpage = self._download_webpage(url, video_id)
3739 m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
# NOTE(review): the `if not m:` line guarding this raise is elided here.
3741 raise ExtractorError(u'Cannot find metadata')
3742 json_data = m.group(1)
3745 data = json.loads(json_data)
3746 except ValueError as e:
3747 raise ExtractorError(u'Invalid JSON: ' + str(e))
# Force the 256 kbit/s rendition via the cbr query parameter.
3749 video_url = data['akamai_url'] + '&cbr=256'
3750 url_parts = compat_urllib_parse_urlparse(video_url)
3751 video_ext = url_parts.path.rpartition('.')[2]
3756 'title': data['title'],
3757 'description': data.get('teaser_text'),
3758 'location': data.get('country_of_origin'),
3759 'uploader': data.get('host', {}).get('name'),
3760 'uploader_id': data.get('host', {}).get('slug'),
# NOTE(review): data.get('image') returns None when 'image' is absent,
# making the chained .get raise AttributeError — unlike the guarded
# 'host' lookups above, this one has no {} default.
3761 'thumbnail': data.get('image').get('large_url_2x'),
3762 'duration': data.get('duration'),
# Extractor for youporn.com: scrapes title/date/uploader, enumerates every
# entry in the page's download list (one per resolution/bitrate), and
# returns one or all formats depending on the requested format.
3767 class YouPornIE(InfoExtractor):
3768 """Information extractor for youporn.com."""
3769 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3771 def _print_formats(self, formats):
3772 """Print all available formats"""
3773 print(u'Available formats:')
3774 print(u'ext\t\tformat')
3775 print(u'---------------------------------')
3776 for format in formats:
3777 print(u'%s\t\t%s' % (format['ext'], format['format']))
# Return the single format dict whose 'format' equals req_format (the
# surrounding loop/return lines are elided from this listing).
3779 def _specific(self, req_format, formats):
3781 if(x["format"]==req_format):
3785 def _real_extract(self, url):
3786 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the `if mobj is None:` guard and return statements are
# elided throughout this listing.
3788 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3791 video_id = mobj.group('videoid')
# The age_verified cookie bypasses the age-gate interstitial.
3793 req = compat_urllib_request.Request(url)
3794 req.add_header('Cookie', 'age_verified=1')
3795 webpage = self._download_webpage(req, video_id)
3797 # Get the video title
3798 result = re.search(r'videoTitleArea">(?P<title>.*)</h1>', webpage)
3800 raise ExtractorError(u'ERROR: unable to extract video title')
3801 video_title = result.group('title').strip()
3803 # Get the video date
3804 result = re.search(r'Date:</b>(?P<date>.*)</li>', webpage)
# Date is optional: warn and continue rather than abort.
3806 self._downloader.to_stderr(u'WARNING: unable to extract video date')
3809 upload_date = result.group('date').strip()
3811 # Get the video uploader
3812 result = re.search(r'Submitted:</b>(?P<uploader>.*)</li>', webpage)
3814 self._downloader.to_stderr(u'ERROR: unable to extract uploader')
3815 video_uploader = None
3817 video_uploader = result.group('uploader').strip()
3818 video_uploader = clean_html( video_uploader )
3820 # Get all of the formats available
3821 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3822 result = re.search(DOWNLOAD_LIST_RE, webpage)
3824 raise ExtractorError(u'Unable to extract download list')
3825 download_list_html = result.group('download_list').strip()
3827 # Get all of the links from the page
3828 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3829 links = re.findall(LINK_RE, download_list_html)
3830 if(len(links) == 0):
3831 raise ExtractorError(u'ERROR: no known formats available for video')
3833 self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))
3838 # A link looks like this:
3839 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3840 # A path looks like this:
3841 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3842 video_url = unescapeHTML( link )
3843 path = compat_urllib_parse_urlparse( video_url ).path
3844 extension = os.path.splitext( path )[1][1:]
# Path segment 4 encodes '<size>_<bitrate>_<id>'; keep size and bitrate.
3845 format = path.split('/')[4].split('_')[:2]
3848 format = "-".join( format )
3849 title = u'%s-%s-%s' % (video_title, size, bitrate)
3854 'uploader': video_uploader,
3855 'upload_date': upload_date,
3860 'description': None,
3864 if self._downloader.params.get('listformats', None):
3865 self._print_formats(formats)
3868 req_format = self._downloader.params.get('format', None)
3869 self._downloader.to_screen(u'[youporn] Format: %s' % req_format)
# Formats are ordered best-first: [-1] is worst, '-1'/'all' returns all.
3871 if req_format is None or req_format == 'best':
3873 elif req_format == 'worst':
3874 return [formats[-1]]
3875 elif req_format in ('-1', 'all'):
3878 format = self._specific( req_format, formats )
3880 self._downloader.trouble(u'ERROR: requested format not available')
# Extractor for pornotube.com: the title comes from the URL itself; the
# flv url and upload date are scraped from the page.
3886 class PornotubeIE(InfoExtractor):
3887 """Information extractor for pornotube.com."""
3888 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3890 def _real_extract(self, url):
3891 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the `if mobj is None:` / `if result is None:` guards and
# return statements are elided from this listing.
3893 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3896 video_id = mobj.group('videoid')
3897 video_title = mobj.group('title')
3899 # Get webpage content
3900 webpage = self._download_webpage(url, video_id)
3903 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3904 result = re.search(VIDEO_URL_RE, webpage)
3906 self._downloader.trouble(u'ERROR: unable to extract video url')
3908 video_url = compat_urllib_parse.unquote(result.group('url'))
3910 #Get the uploaded date
3911 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3912 result = re.search(VIDEO_UPLOADED_RE, webpage)
# NOTE(review): this error message says 'video title' but the failed
# lookup is the upload date — copy/paste slip in the string.
3914 self._downloader.trouble(u'ERROR: unable to extract video title')
3916 upload_date = result.group('date')
3918 info = {'id': video_id,
3921 'upload_date': upload_date,
3922 'title': video_title,
# Extractor for youjizz.com: reads the title from the watch page, follows
# the embed page, and pulls the flash player's "file" variable as the URL.
3930 class YouJizzIE(InfoExtractor):
3931 """Information extractor for youjizz.com."""
3932 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3934 def _real_extract(self, url):
3935 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the `if mobj is None:` / `if result is None:` guards and
# return statements are elided from this listing.
3937 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3940 video_id = mobj.group('videoid')
3942 # Get webpage content
3943 webpage = self._download_webpage(url, video_id)
3945 # Get the video title
3946 result = re.search(r'<title>(?P<title>.*)</title>', webpage)
3948 raise ExtractorError(u'ERROR: unable to extract video title')
3949 video_title = result.group('title').strip()
3951 # Get the embed page
3952 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3954 raise ExtractorError(u'ERROR: unable to extract embed page')
3956 embed_page_url = result.group(0).strip()
# The embed page uses a different (numeric) id than the watch page slug.
3957 video_id = result.group('videoid')
3959 webpage = self._download_webpage(embed_page_url, video_id)
3962 result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
3964 raise ExtractorError(u'ERROR: unable to extract video url')
3965 video_url = result.group('source')
3967 info = {'id': video_id,
3969 'title': video_title,
3972 'player_url': embed_page_url}
3977 def gen_extractors():
3978 """ Return a list of an instance of every supported extractor.
3979 The order does matter; the first extractor matched is the one handling the URL.
3982 YoutubePlaylistIE(),
4006 StanfordOpenClassroomIE(),