2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
14 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader_id:    Nickname or id of the video uploader.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # _ready tracks whether _real_initialize() has already run for this
    # instance so that initialize() is idempotent.
    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url) is not None

    def working(self):
        """Getter method for _WORKING."""
        return self._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # By convention the class is named <Something>IE; strip the "IE".
        return type(self).__name__[:-2]

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Download a webpage and return its decoded contents.

        Raises ExtractorError (with the original traceback attached) on
        any network/HTTP failure.
        """
        if note is None:
            note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        try:
            urlh = compat_urllib_request.urlopen(url_or_request)
            webpage_bytes = urlh.read()
            return webpage_bytes.decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension (NOTE(review): entries other than '38'
    # reconstructed from the itag tables of the era — verify against callers)
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> "HxW" display string used by --list-formats
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose mode, so re.VERBOSE is required.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timedtext XML into SRT subtitle text."""
        srt = ''
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # default caption duration when none given
            start = float(start)
            end = start + float(dur)
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt

    def _extract_subtitles(self, video_id):
        """Download closed captions and return (error_message, srt_text).

        Exactly one element of the pair is None: on success the warning is
        None, on failure the srt text is None.
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        # Pick the requested language, else English, else the first listed.
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            srt_lang = 'en'
        else:
            srt_lang = list(srt_lang_list.keys())[0]
        if not srt_lang in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
        try:
            srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        if not srt_xml:
            return (u'WARNING: unable to download video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))

    def _print_formats(self, formats):
        """Print the known itags with extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set language, optionally log in, and confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
                'current_form': 'loginForm',
                'next':         '/',
                'action_login': 'Log In',
                'username':     username,
                'password':     password,
                }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # The login form being present in the response means login failed.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':         '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the video id from a URL matching _VALID_URL."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several &el= variants until one yields a token
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except ValueError:
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
            if srt_error:
                self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':           video_id,
                'url':          video_real_url,
                'uploader':     video_uploader,
                'uploader_id':  video_uploader_id,
                'upload_date':  upload_date,
                'title':        video_title,
                'ext':          video_extension,
                'format':       video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate yt-prefixed ids to the YouTube extractor.
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fall back to the flashvars blob.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':           video_id.decode('utf-8'),
            'url':          video_url.decode('utf-8'),
            'uploader':     video_uploader.decode('utf-8'),
            'upload_date':  None,
            'title':        video_title,
            'ext':          video_extension.decode('utf-8'),
        }]
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # The path component may carry a "_title" suffix and query string.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Pick the best quality key present, in descending order of quality.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                max_quality = key
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
                break
        else:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if mobj is None:
            # lookin for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
            else:
                video_uploader = mobj_official.group(1)
        else:
            video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if mobj is not None:
            # Page shows DD-MM-YYYY; reorder to YYYYMMDD.
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        return [{
            'id':           video_id,
            'url':          video_url,
            'uploader':     video_uploader,
            'upload_date':  video_upload_date,
            'title':        video_title,
            'ext':          video_extension,
        }]
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        return [{
            'id':           video_id.decode('utf-8'),
            'url':          video_url.decode('utf-8'),
            'uploader':     video_uploader,
            'upload_date':  None,
            'title':        video_title,
            'ext':          video_extension.decode('utf-8'),
        }]
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            # Recurse once with the canonical /watch/ URL.
            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
            return
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id':           video_id.decode('utf-8'),
            'url':          video_url,
            'uploader':     video_uploader,
            'upload_date':  None,
            'title':        video_title,
            'ext':          video_extension.decode('utf-8'),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
        }]
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # NOTE(review): reconstructed from a mangled paste (indentation and
    # several guard/try/return lines were lost) — verify against upstream.
    # _VALID_URL matches Vimeo URLs; the dot after (www|player) is escaped
    # so it no longer matches an arbitrary character.
    _VALID_URL = r'(?:https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Return a one-element list with the info dict for a Vimeo URL."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the player page.
        # Targeted exceptions instead of a bare except: IndexError when the
        # split markers are absent, ValueError when the JSON is malformed.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the best available quality bucket; for/else fires when none match.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':           video_id,
            'url':          video_url,
            'uploader':     video_uploader,
            'uploader_id':  video_uploader_id,
            'upload_date':  video_upload_date,
            'title':        video_title,
            'ext':          video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    # NOTE(review): reconstructed from a mangled paste — the `url` arguments
    # at the grep_webpage() call sites and the return statements were dropped
    # from the paste and have been restored; verify against upstream.
    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download `url` and return its raw body, or None on failure."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch `url`, apply `regex`, and map the listed groups into a dict.

        matchTuples is a list of (group_index, key, error_message); the
        corresponding group value is stored under `key` in the result.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Extract the rtmp path/player for a live stream page.

        NOTE(review): computes video_url but never returns or stores it —
        preserved as-is from the original (apparently dead code).
        """
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the Plus7 ref chain and return the final info dict."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title').decode('utf-8'),
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    # NOTE(review): reconstructed from a mangled paste (guards/returns
    # restored); verify against upstream.
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                        headers=newheaders,
                                        origin_req_host=req.get_origin_req_host(),
                                        unverifiable=True))

        # Build our own opener so the custom HEAD handlers are used.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True

    def _real_extract(self, url):
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            # Fixed copy-paste error: this branch reported 'unable to
            # extract title' although it extracts the uploader.
            self._downloader.trouble(u'ERROR: unable to extract uploader')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':           video_id,
            'url':          video_url,
            'uploader':     video_uploader,
            'upload_date':  None,
            'title':        video_title,
            'ext':          video_extension,
        }]
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""

    # NOTE(review): reconstructed from a mangled paste (prefix parsing and
    # return lines restored); verify against upstream.
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearchN:/ytsearchall: prefix and dispatch the search."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the leading 'ytsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # The API reports the real total; never ask for more than exists.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""

    # NOTE(review): reconstructed from a mangled paste (prefix parsing,
    # loop structure and return lines restored); verify against upstream.
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the gvsearchN:/gvsearchall: prefix and dispatch the search."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the leading 'gvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    # NOTE(review): reconstructed from a mangled paste (prefix parsing,
    # loop structure and return lines restored); verify against upstream.
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearchN:/yvsearchall: prefix and dispatch the search."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the leading 'yvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # NOTE(review): reconstructed from a mangled paste (guards, loop
    # structure and break/return lines restored); verify against upstream.
    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Collect all video ids of the playlist and queue them for download."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Single video case: group(3) carries a concrete video id.
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])
            return

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        else:
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(playlist_id, pagenum)
            url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        total = len(video_ids)

        # Honour --playlist-start / --playlist-end.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        if len(video_ids) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    # NOTE(review): reconstructed from a mangled paste (guards, loop
    # structure and break/return lines restored); verify against upstream.
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        """Collect all video ids of the channel and queue them for download."""
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Download channel pages
        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(channel_id, pagenum)
            url = self._TEMPLATE_URL % (channel_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []
            for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    # NOTE(review): reconstructed from a mangled paste (guards, loop
    # structure and break/return lines restored); verify against upstream.
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Collect all upload ids of a user via the GData API and queue them."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)

        # Honour --playlist-start / --playlist-end.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    # NOTE(review): reconstructed from a mangled paste. _PAGE_SIZE was
    # dropped from the class header but is read in the pagination loop;
    # restored with the value the comment below documents (12).
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Collect all video ids of a blip.tv user and queue them."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            mobj = re.search(r'data-users-id="([^"]+)"', page)
            page_base = page_base % mobj.group(1)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)

            request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)

        # Honour --playlist-start / --playlist-end.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    # NOTE(review): reconstructed from a mangled paste (guards and
    # return lines restored); verify against upstream.
    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Return a one-element list with the info dict for a file URL."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # raw string so \s is not interpreted as a string escape
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
# NOTE(review): elided listing — numeric prefixes are original line numbers and
# several intermediate lines are missing from this view. Code kept byte-identical.
1974 class FacebookIE(InfoExtractor):
1975 """Information Extractor for Facebook"""
1978 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1979 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1980 _NETRC_MACHINE = 'facebook'
# Format names ordered best-first; used for format selection below.
1981 _available_formats = ['video', 'highqual', 'lowqual']
1982 _video_extensions = {
1987 IE_NAME = u'facebook'
1989 def __init__(self, downloader=None):
1990 InfoExtractor.__init__(self, downloader)
1992 def _reporter(self, message):
1993 """Add header and report message."""
1994 self._downloader.to_screen(u'[facebook] %s' % message)
1996 def report_login(self):
1997 """Report attempt to log in."""
1998 self._reporter(u'Logging in')
2000 def report_video_webpage_download(self, video_id):
2001 """Report attempt to download video webpage."""
2002 self._reporter(u'%s: Downloading video webpage' % video_id)
2004 def report_information_extraction(self, video_id):
2005 """Report attempt to extract video information."""
2006 self._reporter(u'%s: Extracting video information' % video_id)
2008 def _parse_page(self, video_webpage):
2009 """Extract video information from page"""
# Map of info-dict field -> regex that captures it from the page source.
2011 data = {'title': r'\("video_title", "(.*?)"\)',
2012 'description': r'<div class="datawrap">(.*?)</div>',
2013 'owner': r'\("video_owner_name", "(.*?)"\)',
2014 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
# Only fields whose regex matches end up in video_info; callers must
# therefore treat every key as optional.
2017 for piece in data.keys():
2018 mobj = re.search(data[piece], video_webpage)
2019 if mobj is not None:
2020 video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
# Collect per-format media URLs ("video_src", "highqual_src", ...).
2024 for fmt in self._available_formats:
2025 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2026 if mobj is not None:
2027 # URL is in a Javascript segment inside an escaped Unicode format within
2028 # the generally utf-8 page
2029 video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
2030 video_info['video_urls'] = video_urls
2034 def _real_initialize(self):
# Nothing to do without a downloader (credentials come from its params).
2035 if self._downloader is None:
2040 downloader_params = self._downloader.params
2042 # Attempt to use provided username and password or .netrc data
2043 if downloader_params.get('username', None) is not None:
2044 useremail = downloader_params['username']
2045 password = downloader_params['password']
2046 elif downloader_params.get('usenetrc', False):
2048 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2049 if info is not None:
2053 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2054 except (IOError, netrc.NetrcParseError) as err:
# .netrc problems are a warning, not fatal — login is best-effort.
2055 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
2058 if useremail is None:
2067 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2070 login_results = compat_urllib_request.urlopen(request).read()
# A login <form> still present in the response means the login failed.
2071 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2072 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2074 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2075 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
2078 def _real_extract(self, url):
2079 mobj = re.match(self._VALID_URL, url)
2081 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2083 video_id = mobj.group('ID')
2086 self.report_video_webpage_download(video_id)
2087 request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2089 page = compat_urllib_request.urlopen(request)
2090 video_webpage = page.read()
2091 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2092 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2095 # Start extracting information
2096 self.report_information_extraction(video_id)
2098 # Extract information
2099 video_info = self._parse_page(video_webpage)
# uploader and title are mandatory; missing either aborts extraction.
2102 if 'owner' not in video_info:
2103 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2105 video_uploader = video_info['owner']
2108 if 'title' not in video_info:
2109 self._downloader.trouble(u'ERROR: unable to extract video title')
2111 video_title = video_info['title']
2112 video_title = video_title.decode('utf-8')
# thumbnail is optional — warn and fall back to an empty string.
2115 if 'thumbnail' not in video_info:
2116 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2117 video_thumbnail = ''
2119 video_thumbnail = video_info['thumbnail']
# Parse an RFC-2822 style date into YYYYMMDD when available.
2123 if 'upload_date' in video_info:
2124 upload_time = video_info['upload_date']
2125 timetuple = email.utils.parsedate_tz(upload_time)
2126 if timetuple is not None:
2128 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2133 video_description = video_info.get('description', 'No description available.')
2135 url_map = video_info['video_urls']
2137 # Decide which formats to download
2138 req_format = self._downloader.params.get('format', None)
2139 format_limit = self._downloader.params.get('format_limit', None)
# format_limit caps quality: keep only formats at or below the limit.
2141 if format_limit is not None and format_limit in self._available_formats:
2142 format_list = self._available_formats[self._available_formats.index(format_limit):]
2144 format_list = self._available_formats
2145 existing_formats = [x for x in format_list if x in url_map]
2146 if len(existing_formats) == 0:
2147 self._downloader.trouble(u'ERROR: no known formats available for video')
2149 if req_format is None:
2150 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2151 elif req_format == 'worst':
2152 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2153 elif req_format == '-1':
2154 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2157 if req_format not in url_map:
2158 self._downloader.trouble(u'ERROR: requested format not available')
2160 video_url_list = [(req_format, url_map[req_format])] # Specific format
# Build one info dict per selected format.
2163 for format_param, video_real_url in video_url_list:
2165 video_extension = self._video_extensions.get(format_param, 'mp4')
2168 'id': video_id.decode('utf-8'),
2169 'url': video_real_url.decode('utf-8'),
2170 'uploader': video_uploader.decode('utf-8'),
2171 'upload_date': upload_date,
2172 'title': video_title,
2173 'ext': video_extension.decode('utf-8'),
2174 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2175 'thumbnail': video_thumbnail.decode('utf-8'),
2176 'description': video_description.decode('utf-8'),
# NOTE(review): elided listing — numeric prefixes are original line numbers and
# some intermediate lines are missing from this view. Code kept byte-identical.
2180 class BlipTVIE(InfoExtractor):
2181 """Information extractor for blip.tv"""
2183 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# Used to pull the filename extension out of a media URL.
2184 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2185 IE_NAME = u'blip.tv'
2187 def report_extraction(self, file_id):
2188 """Report information extraction."""
2189 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2191 def report_direct_download(self, title):
2192 """Report information extraction."""
2193 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2195 def _real_extract(self, url):
2196 mobj = re.match(self._VALID_URL, url)
2198 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Ask blip.tv for a JSON description of the page instead of HTML.
# `cchar` (set on an elided line) is '&' or '?' depending on the URL.
2205 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2206 request = compat_urllib_request.Request(json_url)
2207 self.report_extraction(mobj.group(1))
2210 urlh = compat_urllib_request.urlopen(request)
# If the server answers with video/* the URL is the media itself: derive
# title/ext from the URL and skip the JSON metadata path entirely.
2211 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2212 basename = url.split('/')[-1]
2213 title,ext = os.path.splitext(basename)
2214 title = title.decode('UTF-8')
2215 ext = ext.replace('.', '')
2216 self.report_direct_download(title)
2221 'upload_date': None,
2226 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2227 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2229 if info is None: # Regular URL
2231 json_code_bytes = urlh.read()
2232 json_code = json_code_bytes.decode('utf-8')
2233 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2234 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
2238 json_data = json.loads(json_code)
# The payload is either wrapped in a 'Post' object or is the data itself.
2239 if 'Post' in json_data:
2240 data = json_data['Post']
# blip.tv datestamps look like '08-15-12 10:30AM'; normalize to YYYYMMDD.
2244 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2245 video_url = data['media']['url']
2246 umobj = re.match(self._URL_EXT, video_url)
2248 raise ValueError('Can not determine filename extension')
2249 ext = umobj.group(1)
2252 'id': data['item_id'],
2254 'uploader': data['display_name'],
2255 'upload_date': upload_date,
2256 'title': data['title'],
2258 'format': data['media']['mimeType'],
2259 'thumbnail': data['thumbnailUrl'],
2260 'description': data['description'],
2261 'player_url': data['embedUrl']
# Any missing JSON key or bad date aborts extraction with one message.
2263 except (ValueError,KeyError) as err:
2264 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
# blip.tv serves different (working) media to the iTunes user agent.
2267 std_headers['User-Agent'] = 'iTunes/10.6.1'
2271 class MyVideoIE(InfoExtractor):
2272 """Information Extractor for myvideo.de."""
2274 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2275 IE_NAME = u'myvideo'
2277 def __init__(self, downloader=None):
2278 InfoExtractor.__init__(self, downloader)
2280 def report_extraction(self, video_id):
2281 """Report information extraction."""
2282 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2284 def _real_extract(self,url):
2285 mobj = re.match(self._VALID_URL, url)
2287 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2290 video_id = mobj.group(1)
2293 webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2294 webpage = self._download_webpage(webpage_url, video_id)
2296 self.report_extraction(video_id)
2297 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2300 self._downloader.trouble(u'ERROR: unable to extract media URL')
2302 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2304 mobj = re.search('<title>([^<]+)</title>', webpage)
2306 self._downloader.trouble(u'ERROR: unable to extract title')
2309 video_title = mobj.group(1)
2315 'upload_date': None,
2316 'title': video_title,
# NOTE(review): elided listing — numeric prefixes are original line numbers and
# many intermediate lines are missing from this view. Code kept byte-identical.
2320 class ComedyCentralIE(InfoExtractor):
2321 """Information extractor for The Daily Show and Colbert Report """
2323 # urls can be abbreviations like :thedailyshow or :colbert
2324 # urls for episodes like:
2325 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2326 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2327 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
# Verbose-mode regex; note suitable() below must pass re.VERBOSE explicitly.
2328 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2329 |(https?://)?(www\.)?
2330 (?P<showname>thedailyshow|colbertnation)\.com/
2331 (full-episodes/(?P<episode>.*)|
2333 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2334 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
2336 IE_NAME = u'comedycentral'
# Bitrates ordered worst-first; the last entry is picked as "best" below.
2338 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2340 _video_extensions = {
2348 _video_dimensions = {
2357 def suitable(self, url):
2358 """Receives a URL and returns True if suitable for this IE."""
# Overridden (vs. base class) because _VALID_URL needs re.VERBOSE.
2359 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
2361 def report_extraction(self, episode_id):
2362 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2364 def report_config_download(self, episode_id):
2365 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2367 def report_index_download(self, episode_id):
2368 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2370 def report_player_url(self, episode_id):
2371 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2374 def _print_formats(self, formats):
2375 print('Available formats:')
2377 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2380 def _real_extract(self, url):
2381 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2383 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Expand ":tds"-style abbreviations to the canonical full-episodes URL
# and re-match so the named groups below are populated.
2386 if mobj.group('shortname'):
2387 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2388 url = u'http://www.thedailyshow.com/full-episodes/'
2390 url = u'http://www.colbertnation.com/full-episodes/'
2391 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2392 assert mobj is not None
2394 if mobj.group('clip'):
2395 if mobj.group('showname') == 'thedailyshow':
2396 epTitle = mobj.group('tdstitle')
2398 epTitle = mobj.group('cntitle')
2401 dlNewest = not mobj.group('episode')
2403 epTitle = mobj.group('showname')
2405 epTitle = mobj.group('episode')
2407 req = compat_urllib_request.Request(url)
2408 self.report_extraction(epTitle)
2410 htmlHandle = compat_urllib_request.urlopen(req)
2411 html = htmlHandle.read()
2412 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2413 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
# The "newest episode" URL redirects; re-check the final URL so the
# episode group actually identifies a specific episode.
2416 url = htmlHandle.geturl()
2417 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2419 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2421 if mobj.group('episode') == '':
2422 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2424 epTitle = mobj.group('episode')
# Locate the mtvnservices media URL embedded in the page.
2426 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', html)
2428 if len(mMovieParams) == 0:
2429 # The Colbert Report embeds the information in a without
2430 # a URL prefix; so extract the alternate reference
2431 # and then add the URL prefix manually.
2433 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', html)
2434 if len(altMovieParams) == 0:
2435 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2438 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2440 playerUrl_raw = mMovieParams[0][0]
2441 self.report_player_url(epTitle)
# Follow redirects to resolve the real SWF player URL (for rtmpdump).
2443 urlHandle = compat_urllib_request.urlopen(playerUrl_raw)
2444 playerUrl = urlHandle.geturl()
2445 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2446 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + compat_str(err))
2449 uri = mMovieParams[0][1]
2450 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2451 self.report_index_download(epTitle)
2453 indexXml = compat_urllib_request.urlopen(indexUrl).read()
2454 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2455 self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
# One <item> per act/segment of the episode.
2460 idoc = xml.etree.ElementTree.fromstring(indexXml)
2461 itemEls = idoc.findall('.//item')
2462 for itemEl in itemEls:
2463 mediaId = itemEl.findall('./guid')[0].text
2464 shortMediaId = mediaId.split(':')[-1]
2465 showId = mediaId.split(':')[-2].replace('.com', '')
2466 officialTitle = itemEl.findall('./title')[0].text
2467 officialDate = itemEl.findall('./pubDate')[0].text
2469 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2470 compat_urllib_parse.urlencode({'uri': mediaId}))
2471 configReq = compat_urllib_request.Request(configUrl)
2472 self.report_config_download(epTitle)
2474 configXml = compat_urllib_request.urlopen(configReq).read()
2475 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2476 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
# Collect (bitrate, rtmp-url) pairs for this segment.
2479 cdoc = xml.etree.ElementTree.fromstring(configXml)
2481 for rendition in cdoc.findall('.//rendition'):
2482 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2486 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2489 if self._downloader.params.get('listformats', None):
2490 self._print_formats([i[0] for i in turls])
2493 # For now, just pick the highest bitrate
2494 format,video_url = turls[-1]
2496 # Get the format arg from the arg stream
2497 req_format = self._downloader.params.get('format', None)
2499 # Select format if we can find one
2502 format, video_url = f, v
2505 # Patch to download from alternative CDN, which does not
2506 # break on current RTMPDump builds
2507 broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
2508 better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"
2510 if video_url.startswith(broken_cdn):
2511 video_url = video_url.replace(broken_cdn, better_cdn)
2513 effTitle = showId + u'-' + epTitle
2518 'upload_date': officialDate,
2523 'description': officialTitle,
2524 'player_url': None #playerUrl
2527 results.append(info)
# NOTE(review): elided listing — numeric prefixes are original line numbers and
# some intermediate lines are missing from this view. Code kept byte-identical.
2532 class EscapistIE(InfoExtractor):
2533 """Information extractor for The Escapist """
2535 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2536 IE_NAME = u'escapist'
2538 def report_extraction(self, showName):
2539 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2541 def report_config_download(self, showName):
2542 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2544 def _real_extract(self, url):
2545 mobj = re.match(self._VALID_URL, url)
2547 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2549 showName = mobj.group('showname')
2550 videoId = mobj.group('episode')
2552 self.report_extraction(showName)
2554 webPage = compat_urllib_request.urlopen(url)
2555 webPageBytes = webPage.read()
# Honor the charset from Content-Type; fall back to utf-8 when absent.
2556 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2557 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2558 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2559 self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
# Pull metadata out of <meta> tags; og:video carries the player URL whose
# query string points at the JSON-ish config.
2562 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2563 description = unescapeHTML(descMatch.group(1))
2564 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2565 imgUrl = unescapeHTML(imgMatch.group(1))
2566 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2567 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2568 configUrlMatch = re.search('config=(.*)$', playerUrl)
2569 configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2571 self.report_config_download(showName)
2573 configJSON = compat_urllib_request.urlopen(configUrl)
2574 m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
2575 configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
2576 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2577 self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
2580 # Technically, it's JavaScript, not JSON
# Naive quote swap to coerce the JS object into parseable JSON; breaks if
# values themselves contain single quotes — acceptable for this site.
2581 configJSON = configJSON.replace("'", '"')
2584 config = json.loads(configJSON)
2585 except (ValueError,) as err:
2586 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
2589 playlist = config['playlist']
# Index 1 holds the actual video entry (index 0 is presumably an intro/ad
# — TODO confirm against the site's config format).
2590 videoUrl = playlist[1]['url']
2595 'uploader': showName,
2596 'upload_date': None,
2599 'thumbnail': imgUrl,
2600 'description': description,
2601 'player_url': playerUrl,
# NOTE(review): elided listing — numeric prefixes are original line numbers and
# some intermediate lines are missing from this view. Code kept byte-identical.
2607 class CollegeHumorIE(InfoExtractor):
2608 """Information extractor for collegehumor.com"""
2611 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2612 IE_NAME = u'collegehumor'
2614 def report_manifest(self, video_id):
2615 """Report information extraction."""
2616 self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))
2618 def report_extraction(self, video_id):
2619 """Report information extraction."""
2620 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2622 def _real_extract(self, url):
2623 mobj = re.match(self._VALID_URL, url)
2625 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2627 video_id = mobj.group('videoid')
2632 'upload_date': None,
2635 self.report_extraction(video_id)
# Step 1: site metadata XML (title/description/thumbnail + manifest URL).
2636 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2638 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2639 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2640 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2643 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2645 videoNode = mdoc.findall('./video')[0]
2646 info['description'] = videoNode.findall('./description')[0].text
2647 info['title'] = videoNode.findall('./caption')[0].text
2648 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2649 manifest_url = videoNode.findall('./file')[0].text
# IndexError from any missing element lands here (except line elided).
2651 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
# Step 2: Adobe HDS (f4m) manifest; hdcore param is required by the CDN.
2654 manifest_url += '?hdcore=2.10.3'
2655 self.report_manifest(video_id)
2657 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2658 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2659 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2662 adoc = xml.etree.ElementTree.fromstring(manifestXml)
# f4m elements live in the Adobe namespace, hence the {..} prefixes.
2664 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2665 node_id = media_node.attrib['url']
2666 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2667 except IndexError as err:
2668 self._downloader.trouble(u'\nERROR: Invalid manifest file')
# Rebuild the segment URL from the manifest location + media/id fields.
2671 url_pr = compat_urllib_parse_urlparse(manifest_url)
2672 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
# NOTE(review): elided listing — numeric prefixes are original line numbers and
# some intermediate lines are missing from this view. Code kept byte-identical.
2679 class XVideosIE(InfoExtractor):
2680 """Information extractor for xvideos.com"""
2682 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2683 IE_NAME = u'xvideos'
2685 def report_extraction(self, video_id):
2686 """Report information extraction."""
2687 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2689 def _real_extract(self, url):
2690 mobj = re.match(self._VALID_URL, url)
2692 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2694 video_id = mobj.group(1)
2696 webpage = self._download_webpage(url, video_id)
2698 self.report_extraction(video_id)
# The flash player receives the media URL via a flv_url= query parameter.
2702 mobj = re.search(r'flv_url=(.+?)&', webpage)
2704 self._downloader.trouble(u'ERROR: unable to extract video url')
2706 video_url = compat_urllib_parse.unquote(mobj.group(1))
# Title is the <title> text up to the " - XVID..." suffix.
2710 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2712 self._downloader.trouble(u'ERROR: unable to extract video title')
2714 video_title = mobj.group(1)
2717 # Extract video thumbnail
# group(0): the whole matched URL is the thumbnail, not a capture group.
2718 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2720 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2722 video_thumbnail = mobj.group(0)
2728 'upload_date': None,
2729 'title': video_title,
2731 'thumbnail': video_thumbnail,
2732 'description': None,
# NOTE(review): elided listing — numeric prefixes are original line numbers and
# some intermediate lines are missing from this view. Code kept byte-identical.
2738 class SoundcloudIE(InfoExtractor):
2739 """Information extractor for soundcloud.com
2740 To access the media, the uid of the song and a stream token
2741 must be extracted from the page source and the script must make
2742 a request to media.soundcloud.com/crossdomain.xml. Then
2743 the media can be grabbed by requesting from an url composed
2744 of the stream token and uid
2747 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2748 IE_NAME = u'soundcloud'
2750 def __init__(self, downloader=None):
2751 InfoExtractor.__init__(self, downloader)
2753 def report_resolve(self, video_id):
2754 """Report information extraction."""
2755 self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))
2757 def report_extraction(self, video_id):
2758 """Report information extraction."""
2759 self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))
2761 def _real_extract(self, url):
2762 mobj = re.match(self._VALID_URL, url)
2764 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2767 # extract uploader (which is in the url)
2768 uploader = mobj.group(1)
2769 # extract simple title (uploader + slug of song title)
2770 slug_title = mobj.group(2)
2771 simple_title = uploader + u'-' + slug_title
2773 self.report_resolve('%s/%s' % (uploader, slug_title))
# Step 1: resolve.json maps the public page URL to the track's API record.
# client_id is a hard-coded public API key.
2775 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2776 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2777 request = compat_urllib_request.Request(resolv_url)
2779 info_json_bytes = compat_urllib_request.urlopen(request).read()
2780 info_json = info_json_bytes.decode('utf-8')
2781 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2782 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2785 info = json.loads(info_json)
2786 video_id = info['id']
2787 self.report_extraction('%s/%s' % (uploader, slug_title))
# Step 2: the streams endpoint lists the actual media URLs per format.
2789 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2790 request = compat_urllib_request.Request(streams_url)
2792 stream_json_bytes = compat_urllib_request.urlopen(request).read()
2793 stream_json = stream_json_bytes.decode('utf-8')
2794 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2795 self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
2798 streams = json.loads(stream_json)
# Always picks the 128kbps MP3 HTTP stream.
2799 mediaURL = streams['http_mp3_128_url']
2804 'uploader': info['user']['username'],
2805 'upload_date': info['created_at'],
2806 'title': info['title'],
2808 'description': info['description'],
# NOTE(review): elided listing — numeric prefixes are original line numbers and
# some intermediate lines are missing from this view. Code kept byte-identical.
2812 class InfoQIE(InfoExtractor):
2813 """Information extractor for infoq.com"""
2814 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2816 def report_extraction(self, video_id):
2817 """Report information extraction."""
2818 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2820 def _real_extract(self, url):
2821 mobj = re.match(self._VALID_URL, url)
2823 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# No separate id in the URL pattern, so the URL doubles as the video id.
2826 webpage = self._download_webpage(url, video_id=url)
2827 self.report_extraction(url)
# The real media id is base64-encoded in a jsclassref attribute.
2830 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2832 self._downloader.trouble(u'ERROR: unable to extract video url')
2834 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2835 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2838 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2840 self._downloader.trouble(u'ERROR: unable to extract video title')
2842 video_title = mobj.group(1)
2844 # Extract description
2845 video_description = u'No description available.'
2846 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2847 if mobj is not None:
2848 video_description = mobj.group(1)
# Derive id and extension from the media filename in the rtmpe URL.
2850 video_filename = video_url.split('/')[-1]
2851 video_id, extension = video_filename.split('.')
2857 'upload_date': None,
2858 'title': video_title,
2859 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2861 'description': video_description,
# NOTE(review): elided listing — numeric prefixes are original line numbers and
# some intermediate lines are missing from this view. Code kept byte-identical.
2866 class MixcloudIE(InfoExtractor):
2867 """Information extractor for www.mixcloud.com"""
# Marked broken: the site moved to a new API (see comment).
2869 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2870 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2871 IE_NAME = u'mixcloud'
2873 def __init__(self, downloader=None):
2874 InfoExtractor.__init__(self, downloader)
2876 def report_download_json(self, file_id):
2877 """Report JSON download."""
2878 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2880 def report_extraction(self, file_id):
2881 """Report information extraction."""
2882 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2884 def get_urls(self, jsonData, fmt, bitrate='best'):
2885 """Get urls from 'audio_formats' section in json"""
2888 bitrate_list = jsonData[fmt]
2889 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
# "best" means the highest bitrate key available for this format.
2890 bitrate = max(bitrate_list) # select highest
2892 url_list = jsonData[fmt][bitrate]
# When the format maps straight to a URL list (no per-bitrate dict),
# indexing with a bitrate raises TypeError — fall back to the flat list.
2893 except TypeError: # we have no bitrate info.
2894 url_list = jsonData[fmt]
2897 def check_urls(self, url_list):
2898 """Returns 1st active url from list"""
# Probes each candidate with an HTTP request; the loop's success/return
# lines are elided from this view.
2899 for url in url_list:
2901 compat_urllib_request.urlopen(url)
2903 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2908 def _print_formats(self, formats):
2909 print('Available formats:')
2910 for fmt in formats.keys():
2911 for b in formats[fmt]:
2913 ext = formats[fmt][b][0]
2914 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2915 except TypeError: # we have no bitrate info
2916 ext = formats[fmt][0]
2917 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2920 def _real_extract(self, url):
2921 mobj = re.match(self._VALID_URL, url)
2923 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2925 # extract uploader & filename from url
# NOTE(review): .decode on a str is Python-2 style; under Python 3 this
# would raise AttributeError — consistent with _WORKING = False above.
2926 uploader = mobj.group(1).decode('utf-8')
2927 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2929 # construct API request
2930 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2931 # retrieve .json file with links to files
2932 request = compat_urllib_request.Request(file_url)
2934 self.report_download_json(file_url)
2935 jsonData = compat_urllib_request.urlopen(request).read()
2936 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2937 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
2941 json_data = json.loads(jsonData)
2942 player_url = json_data['player_swf_url']
2943 formats = dict(json_data['audio_formats'])
2945 req_format = self._downloader.params.get('format', None)
2948 if self._downloader.params.get('listformats', None):
2949 self._print_formats(formats)
# No/best format requested: probe formats until one has a live URL.
2952 if req_format is None or req_format == 'best':
2953 for format_param in formats.keys():
2954 url_list = self.get_urls(formats, format_param)
2956 file_url = self.check_urls(url_list)
2957 if file_url is not None:
2960 if req_format not in formats:
2961 self._downloader.trouble(u'ERROR: format is not available')
2964 url_list = self.get_urls(formats, req_format)
2965 file_url = self.check_urls(url_list)
2966 format_param = req_format
2969 'id': file_id.decode('utf-8'),
2970 'url': file_url.decode('utf-8'),
2971 'uploader': uploader.decode('utf-8'),
2972 'upload_date': None,
2973 'title': json_data['name'],
2974 'ext': file_url.split('.')[-1].decode('utf-8'),
2975 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2976 'thumbnail': json_data['thumbnail_url'],
2977 'description': json_data['description'],
2978 'player_url': player_url.decode('utf-8'),
# NOTE(review): mangled extract — original-file line numbers are fused into
# each code line and gaps in that numbering mark elided statements (guards,
# `try:`/`return` lines, dict braces). Code tokens byte-identical; comments only.
#
# Three-way extractor: a specific video (course+video in URL, metadata via an
# XML file), a course page (scrapes VideoPage links into a playlist of
# references), or the site root (scrapes CoursePage links likewise); the two
# playlist branches recurse through self.extract() on each reference.
2981 class StanfordOpenClassroomIE(InfoExtractor):
2982 """Information extractor for Stanford's Open ClassRoom"""
2984 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2985 IE_NAME = u'stanfordoc'
2987 def report_download_webpage(self, objid):
2988 """Report information extraction."""
2989 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2991 def report_extraction(self, video_id):
2992 """Report information extraction."""
2993 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2995 def _real_extract(self, url):
2996 mobj = re.match(self._VALID_URL, url)
# elided 2997: presumably `if mobj is None:` — TODO confirm
2998 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3001 if mobj.group('course') and mobj.group('video'): # A specific video
3002 course = mobj.group('course')
3003 video = mobj.group('video')
# elided 3004: presumably `info = {` opening the info dict — TODO confirm
3005 'id': course + '_' + video,
3007 'upload_date': None,
3010 self.report_extraction(info['id'])
3011 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3012 xmlUrl = baseUrl + video + '.xml'
# elided 3013: presumably `try:` matching the except on 3015 — TODO confirm
3014 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
3015 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3016 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
3018 mdoc = xml.etree.ElementTree.fromstring(metaXml)
# elided 3019: presumably `try:` matching the trouble() on 3023 — TODO confirm
3020 info['title'] = mdoc.findall('./title')[0].text
3021 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3023 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3025 info['ext'] = info['url'].rpartition('.')[2]
3027 elif mobj.group('course'): # A course page
3028 course = mobj.group('course')
3033 'upload_date': None,
3036 self.report_download_webpage(info['id'])
3038 coursepage = compat_urllib_request.urlopen(url).read()
3039 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3040 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
3043 m = re.search('<h1>([^<]+)</h1>', coursepage)
3045 info['title'] = unescapeHTML(m.group(1))
# fall back to the id when no <h1> title is found
3047 info['title'] = info['id']
3049 m = re.search('<description>([^<]+)</description>', coursepage)
3051 info['description'] = unescapeHTML(m.group(1))
3053 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3056 'type': 'reference',
3057 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3061 for entry in info['list']:
3062 assert entry['type'] == 'reference'
# recurse: extract every referenced video page and concatenate results
3063 results += self.extract(entry['url'])
3068 'id': 'Stanford OpenClassroom',
3071 'upload_date': None,
3074 self.report_download_webpage(info['id'])
3075 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3077 rootpage = compat_urllib_request.urlopen(rootURL).read()
3078 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3079 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
3082 info['title'] = info['id']
3084 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3087 'type': 'reference',
3088 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3093 for entry in info['list']:
3094 assert entry['type'] == 'reference'
3095 results += self.extract(entry['url'])
# NOTE(review): mangled extract — original-file line numbers are fused into
# each code line and numbering gaps mark elided statements (None-guards,
# `try:`/`return` lines, dict braces). Code tokens byte-identical; comments only.
#
# Scrapes song name, performer, mtvn_uri and playlist id out of <meta> tags,
# then downloads a mediaGen XML and takes the last <rendition> (highest
# quality) for the actual video url.
3098 class MTVIE(InfoExtractor):
3099 """Information extractor for MTV.com"""
3101 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3104 def report_extraction(self, video_id):
3105 """Report information extraction."""
3106 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3108 def _real_extract(self, url):
3109 mobj = re.match(self._VALID_URL, url)
# elided 3110: presumably `if mobj is None:` — TODO confirm
3111 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# normalize scheme-less urls before fetching
3113 if not mobj.group('proto'):
3114 url = 'http://' + url
3115 video_id = mobj.group('videoid')
3117 webpage = self._download_webpage(url, video_id)
3119 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3121 self._downloader.trouble(u'ERROR: unable to extract song name')
3123 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3124 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3126 self._downloader.trouble(u'ERROR: unable to extract performer')
3128 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3129 video_title = performer + ' - ' + song_name
3131 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3133 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3135 mtvn_uri = mobj.group(1)
3137 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3139 self._downloader.trouble(u'ERROR: unable to extract content id')
3141 content_id = mobj.group(1)
3143 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3144 self.report_extraction(video_id)
3145 request = compat_urllib_request.Request(videogen_url)
# elided 3146: presumably `try:` matching the except on 3148 — TODO confirm
3147 metadataXml = compat_urllib_request.urlopen(request).read()
3148 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3149 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
3152 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3153 renditions = mdoc.findall('.//rendition')
3155 # For now, always pick the highest quality.
3156 rendition = renditions[-1]
# elided 3157-3158: presumably `try:` matching the trouble() on 3163 — TODO confirm
3159 _,_,ext = rendition.attrib['type'].partition('/')
3160 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3161 video_url = rendition.find('./src').text
3163 self._downloader.trouble('Invalid rendition field.')
# elided 3164-3168: presumably the info-dict open with id/url/ext — TODO confirm
3169 'uploader': performer,
3170 'upload_date': None,
3171 'title': video_title,
# NOTE(review): mangled extract — original-file line numbers are fused into
# each code line and numbering gaps mark elided statements. Code tokens
# byte-identical; comments only.
#
# Youku serves videos in numbered segments; the extractor de-scrambles the
# stream file id with a seeded keystream (_get_file_ID_mix_string /
# _get_file_id) and emits one info dict per segment.
3179 class YoukuIE(InfoExtractor):
3180 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3182 def report_download_webpage(self, file_id):
3183 """Report webpage download."""
3184 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))
3186 def report_extraction(self, file_id):
3187 """Report information extraction."""
3188 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
# elided 3189-3190: presumably `def _gen_sid(self):` header (it is called on 3267) — TODO confirm
3191 nowTime = int(time.time() * 1000)
3192 random1 = random.randint(1000,1998)
3193 random2 = random.randint(1000,9999)
3195 return "%d%d%d" %(nowTime,random1,random2)
3197 def _get_file_ID_mix_string(self, seed):
# elided 3198: presumably `mixed = []` (appended to on 3204) — TODO confirm
3199 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
# linear-congruential shuffle of the alphabet keyed by `seed`
3201 for i in range(len(source)):
3202 seed = (seed * 211 + 30031 ) % 65536
3203 index = math.floor(seed / 65536 * len(source) )
3204 mixed.append(source[int(index)])
3205 source.remove(source[int(index)])
3206 #return ''.join(mixed)
3209 def _get_file_id(self, fileId, seed):
3210 mixed = self._get_file_ID_mix_string(seed)
3211 ids = fileId.split('*')
# elided 3212-3214: presumably `realId = []` and the loop over `ids` — TODO confirm
3215 realId.append(mixed[int(ch)])
3216 return ''.join(realId)
3218 def _real_extract(self, url):
3219 mobj = re.match(self._VALID_URL, url)
# elided 3220: presumably `if mobj is None:` — TODO confirm
3221 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3223 video_id = mobj.group('ID')
3225 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3227 request = compat_urllib_request.Request(info_url, None, std_headers)
# elided 3228: presumably `try:` matching the except on 3231 — TODO confirm
3229 self.report_download_webpage(video_id)
3230 jsondata = compat_urllib_request.urlopen(request).read()
3231 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3232 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3235 self.report_extraction(video_id)
# elided 3236: presumably `try:` matching the except on 3262 — TODO confirm
3237 jsonstr = jsondata.decode('utf-8')
3238 config = json.loads(jsonstr)
3240 video_title = config['data'][0]['title']
3241 seed = config['data'][0]['seed']
3243 format = self._downloader.params.get('format', None)
3244 supported_format = list(config['data'][0]['streamfileids'].keys())
3246 if format is None or format == 'best':
3247 if 'hd2' in supported_format:
# elided 3248-3259: presumably the hd2/flv/mp4 format/ext selection ladder — TODO confirm
3252 elif format == 'worst':
3260 fileid = config['data'][0]['streamfileids'][format]
3261 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3262 except (UnicodeDecodeError, ValueError, KeyError):
3263 self._downloader.trouble(u'ERROR: unable to extract info section')
3267 sid = self._gen_sid()
3268 fileid = self._get_file_id(fileid, seed)
3270 #column 8,9 of fileid represent the segment number
3271 #fileid[7:9] should be changed
3272 for index, key in enumerate(keys):
3274 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3275 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
# elided 3276-3277: presumably `info = {` opening the per-segment dict — TODO confirm
3278 'id': '%s_part%02d' % (video_id, index),
3279 'url': download_url,
3281 'upload_date': None,
3282 'title': video_title,
3285 files_info.append(info)
# NOTE(review): mangled extract — original-file line numbers are fused into
# each code line and numbering gaps mark elided statements. Code tokens
# byte-identical; comments only.
#
# Simple regex scraper: pulls flv url, title and thumbnail out of the page
# with the three class-level patterns below.
3290 class XNXXIE(InfoExtractor):
3291 """Information extractor for xnxx.com"""
3293 _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
3295 VIDEO_URL_RE = r'flv_url=(.*?)&'
3296 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3297 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3299 def report_webpage(self, video_id):
3300 """Report information extraction"""
3301 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3303 def report_extraction(self, video_id):
3304 """Report information extraction"""
3305 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3307 def _real_extract(self, url):
3308 mobj = re.match(self._VALID_URL, url)
# elided 3309: presumably `if mobj is None:` — TODO confirm
3310 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3312 video_id = mobj.group(1)
3314 self.report_webpage(video_id)
3316 # Get webpage content
# elided 3317: presumably `try:` matching the except on 3320 — TODO confirm
3318 webpage_bytes = compat_urllib_request.urlopen(url).read()
3319 webpage = webpage_bytes.decode('utf-8')
3320 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3321 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3324 result = re.search(self.VIDEO_URL_RE, webpage)
# elided 3325: presumably `if result is None:` — TODO confirm
3326 self._downloader.trouble(u'ERROR: unable to extract video url')
3328 video_url = compat_urllib_parse.unquote(result.group(1))
3330 result = re.search(self.VIDEO_TITLE_RE, webpage)
3332 self._downloader.trouble(u'ERROR: unable to extract video title')
3334 video_title = result.group(1)
3336 result = re.search(self.VIDEO_THUMB_RE, webpage)
3338 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3340 video_thumbnail = result.group(1)
# elided 3341-3345: presumably `return [{` with id/url/uploader keys — TODO confirm
3346 'upload_date': None,
3347 'title': video_title,
3349 'thumbnail': video_thumbnail,
3350 'description': None,
# NOTE(review): mangled extract — original-file line numbers are fused into
# each code line and numbering gaps mark elided statements. Code tokens
# byte-identical; comments only.
#
# Two-step extraction: (1) fetch the post page and scrape date, uploader and
# title; (2) fetch the linked photos/video page and pick the
# highest-resolution redirector url from the sorted link list.
3354 class GooglePlusIE(InfoExtractor):
3355 """Information extractor for plus.google.com."""
3357 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3358 IE_NAME = u'plus.google'
3360 def __init__(self, downloader=None):
3361 InfoExtractor.__init__(self, downloader)
3363 def report_extract_entry(self, url):
3364 """Report downloading extry"""
3365 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3367 def report_date(self, upload_date):
3368 """Report downloading extry"""
3369 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3371 def report_uploader(self, uploader):
3372 """Report downloading extry"""
3373 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3375 def report_title(self, video_title):
3376 """Report downloading extry"""
3377 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3379 def report_extract_vid_page(self, video_page):
3380 """Report information extraction."""
3381 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3383 def _real_extract(self, url):
3384 # Extract id from URL
3385 mobj = re.match(self._VALID_URL, url)
# elided 3386: presumably `if mobj is None:` — TODO confirm
3387 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3390 post_url = mobj.group(0)
3391 video_id = mobj.group(1)
3393 video_extension = 'flv'
3395 # Step 1, Retrieve post webpage to extract further information
3396 self.report_extract_entry(post_url)
3397 request = compat_urllib_request.Request(post_url)
# elided 3398: presumably `try:` matching the except on 3400 — TODO confirm
3399 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3400 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3401 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3404 # Extract update date
# elided 3405: presumably `upload_date = None` default — TODO confirm
3406 pattern = 'title="Timestamp">(.*?)</a>'
3407 mobj = re.search(pattern, webpage)
# elided 3408: presumably `if mobj:` guard — TODO confirm
3409 upload_date = mobj.group(1)
3410 # Convert timestring to a format suitable for filename
3411 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3412 upload_date = upload_date.strftime('%Y%m%d')
3413 self.report_date(upload_date)
# elided 3414-3416: presumably uploader default + "Extract uploader" comment — TODO confirm
3417 pattern = r'rel\="author".*?>(.*?)</a>'
3418 mobj = re.search(pattern, webpage)
3420 uploader = mobj.group(1)
3421 self.report_uploader(uploader)
3424 # Get the first line for title
# elided 3425: presumably a title default — TODO confirm
3426 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3427 mobj = re.search(pattern, webpage)
3429 video_title = mobj.group(1)
3430 self.report_title(video_title)
3432 # Step 2, Stimulate clicking the image box to launch video
3433 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3434 mobj = re.search(pattern, webpage)
# elided 3435: presumably `if mobj is None:` — TODO confirm
3436 self._downloader.trouble(u'ERROR: unable to extract video page URL')
3438 video_page = mobj.group(1)
3439 request = compat_urllib_request.Request(video_page)
# elided 3440: presumably `try:` matching the except on 3442 — TODO confirm
3441 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3442 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3443 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3445 self.report_extract_vid_page(video_page)
3448 # Extract video links on video page
3449 """Extract video links of all sizes"""
3450 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3451 mobj = re.findall(pattern, webpage)
# elided 3452: presumably `if len(mobj) == 0:` — TODO confirm
3453 self._downloader.trouble(u'ERROR: unable to extract video links')
3455 # Sort in resolution
3456 links = sorted(mobj)
3458 # Choose the lowest of the sort, i.e. highest resolution
3459 video_url = links[-1]
3460 # Only get the url. The resolution part in the tuple has no use anymore
3461 video_url = video_url[-1]
3462 # Treat escaped \u0026 style hex
# elided 3463: presumably `try:` matching the except on 3465 — TODO confirm
3464 video_url = video_url.decode("unicode_escape")
3465 except AttributeError: # Python 3
3466 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
# elided 3467-3471: presumably `return [{` with id/url keys — TODO confirm
3472 'uploader': uploader,
3473 'upload_date': upload_date,
3474 'title': video_title,
3475 'ext': video_extension,
# NOTE(review): mangled extract — original-file line numbers are fused into
# each code line and numbering gaps mark elided statements. Code tokens
# byte-identical; comments only.
#
# Builds the CDN mp4 url directly from the url path; remaining metadata is
# scraped via the _findProp closure over the downloaded page.
3478 class NBAIE(InfoExtractor):
3479 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3482 def _real_extract(self, url):
3483 mobj = re.match(self._VALID_URL, url)
# elided 3484: presumably `if mobj is None:` — TODO confirm
3485 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3488 video_id = mobj.group(1)
3489 if video_id.endswith('/index.html'):
3490 video_id = video_id[:-len('/index.html')]
3492 webpage = self._download_webpage(url, video_id)
3494 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
# helper closure: first regex group from the page, unescaped, else `default`
3495 def _findProp(rexp, default=None):
3496 m = re.search(rexp, webpage)
# elided 3497: presumably `if m:` guard (with `return default` below) — TODO confirm
3498 return unescapeHTML(m.group(1))
3502 shortened_video_id = video_id.rpartition('/')[2]
3503 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
# elided 3504: presumably `info = {` opening the result dict — TODO confirm
3505 'id': shortened_video_id,
# NOTE(review): key 'uploader_date' looks like a typo for the documented
# 'upload_date' field — verify against FileDownloader's consumers.
3509 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3510 'description': _findProp(r'<div class="description">(.*?)</h1>'),
# NOTE(review): mangled extract — original-file line numbers are fused into
# each code line and numbering gaps mark elided statements. Code tokens
# byte-identical; comments only.
#
# Talks to the justin.tv REST API: a channel url pages through archives in
# _JUSTIN_PAGE_LIMIT chunks, a /b/ url fetches a single clip.
3514 class JustinTVIE(InfoExtractor):
3515 """Information extractor for justin.tv and twitch.tv"""
3516 # TODO: One broadcast may be split into multiple videos. The key
3517 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3518 # starts at 1 and increases. Can we treat all parts as one video?
3520 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3521 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3522 _JUSTIN_PAGE_LIMIT = 100
3523 IE_NAME = u'justin.tv'
3525 def report_extraction(self, file_id):
3526 """Report information extraction."""
3527 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3529 def report_download_page(self, channel, offset):
3530 """Report attempt to download a single page of videos."""
3531 self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
3532 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3534 # Return count of items, list of *valid* items
3535 def _parse_page(self, url):
# elided 3536: presumably `try:` matching the except on 3540 — TODO confirm
3537 urlh = compat_urllib_request.urlopen(url)
3538 webpage_bytes = urlh.read()
3539 webpage = webpage_bytes.decode('utf-8', 'ignore')
3540 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3541 self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
3544 response = json.loads(webpage)
# elided 3545: presumably `info = []` accumulator — TODO confirm
3546 for clip in response:
3547 video_url = clip['video_file_url']
# elided 3548: presumably `if video_url:` filter for *valid* items — TODO confirm
3549 video_extension = os.path.splitext(video_url)[1][1:]
# created_on is ISO 'YYYY-MM-DD…'; strip dashes to get YYYYMMDD
3550 video_date = re.sub('-', '', clip['created_on'][:10])
# elided 3551-3553: presumably `info.append({` with id/url keys — TODO confirm
3554 'title': clip['title'],
3555 'uploader': clip.get('user_id', clip.get('channel_id')),
3556 'upload_date': video_date,
3557 'ext': video_extension,
3559 return (len(response), info)
3561 def _real_extract(self, url):
3562 mobj = re.match(self._VALID_URL, url)
# elided 3563: presumably `if mobj is None:` — TODO confirm
3564 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3567 api = 'http://api.justin.tv'
3568 video_id = mobj.group(mobj.lastindex)
# lastindex == 1 means only the channel group matched (no /b/ clip id)
3570 if mobj.lastindex == 1:
3572 api += '/channel/archives/%s.json'
3574 api += '/clip/show/%s.json'
3575 api = api % (video_id,)
3577 self.report_extraction(video_id)
# elided 3578-3583: presumably `info = []`, `offset = 0`, `paged` setup — TODO confirm
3581 limit = self._JUSTIN_PAGE_LIMIT
3584 self.report_download_page(video_id, offset)
3585 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3586 page_count, page_info = self._parse_page(page_url)
3587 info.extend(page_info)
# a short page means the last page of the archive listing
3588 if not paged or page_count != limit:
# NOTE(review): mangled extract — original-file line numbers are fused into
# each code line and numbering gaps mark elided statements. Code tokens
# byte-identical; comments only.
#
# Scrapes the <video>/<source> src, the player-page title and the
# og:description meta tag from the page.
3593 class FunnyOrDieIE(InfoExtractor):
3594 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3596 def _real_extract(self, url):
3597 mobj = re.match(self._VALID_URL, url)
# elided 3598: presumably `if mobj is None:` — TODO confirm
3599 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3602 video_id = mobj.group('id')
3603 webpage = self._download_webpage(url, video_id)
3605 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
# elided 3606: presumably `if not m:` — TODO confirm
3607 self._downloader.trouble(u'ERROR: unable to find video information')
3608 video_url = unescapeHTML(m.group('url'))
3610 m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
3612 self._downloader.trouble(u'Cannot find video title')
3613 title = unescapeHTML(m.group('title'))
3615 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
# elided 3616: presumably `if m:` with a None fallback — TODO confirm
3617 desc = unescapeHTML(m.group('desc'))
# elided 3618-3625: presumably `info = {`/`return [info]` with id/url/ext/title — TODO confirm
3626 'description': desc,
# NOTE(review): mangled extract — original-file line numbers are fused into
# each code line and numbering gaps mark elided statements. Code tokens
# byte-identical; comments only.
#
# Scrapes status id, tweet text, uploader and unix timestamp from the page,
# then builds the fixed files.tweetreel.com .mov url from the status id.
3630 class TweetReelIE(InfoExtractor):
3631 _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'
3633 def _real_extract(self, url):
3634 mobj = re.match(self._VALID_URL, url)
# elided 3635: presumably `if mobj is None:` — TODO confirm
3636 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3639 video_id = mobj.group('id')
3640 webpage = self._download_webpage(url, video_id)
3642 m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
# elided 3643: presumably `if not m:` — TODO confirm
3644 self._downloader.trouble(u'ERROR: Cannot find status ID')
3645 status_id = m.group(1)
3647 m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
3649 self._downloader.trouble(u'WARNING: Cannot find description')
# tweet text doubles as the description, with anchor tags stripped
3650 desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()
3652 m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
3654 self._downloader.trouble(u'ERROR: Cannot find uploader')
3655 uploader = unescapeHTML(m.group('uploader'))
3656 uploader_id = unescapeHTML(m.group('uploader_id'))
3658 m = re.search(r'<span unixtime="([0-9]+)"', webpage)
3660 self._downloader.trouble(u'ERROR: Cannot find upload date')
3661 upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')
3664 video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'
# elided 3665-3670: presumably `info = {`/`return [info]` with id/url/ext/title — TODO confirm
3671 'description': desc,
3672 'uploader': uploader,
3673 'uploader_id': uploader_id,
3674 'internal_id': status_id,
3675 'upload_date': upload_date
# NOTE(review): mangled extract — original-file line numbers are fused into
# each code line and numbering gaps mark elided statements. Code tokens
# byte-identical; comments only.
#
# Extracts every movie on a Steam store video page: pairs the 'movie_NNN'
# javascript entries with the <span class="title"> list in page order.
3679 class SteamIE(InfoExtractor):
3680 _VALID_URL = r"""http://store.steampowered.com/
3681 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3683 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
# elided 3682, 3684-3685: presumably the (?P<gameID>...) part of this verbose
# pattern (group used on 3693) and its closing quote — TODO confirm
3686 def suitable(self, url):
3687 """Receives a URL and returns True if suitable for this IE."""
# overridden because the verbose (?x-style) pattern needs re.VERBOSE here
3688 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
3690 def _real_extract(self, url):
3691 m = re.match(self._VALID_URL, url, re.VERBOSE)
3692 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3693 gameID = m.group('gameID')
3694 videourl = 'http://store.steampowered.com/video/%s/' % gameID
3695 webpage = self._download_webpage(videourl, gameID)
3696 mweb = re.finditer(urlRE, webpage)
3697 namesRE = r'<span class=\"title\">(?P<videoName>[\w:/\.\?=\+\s-]+)</span>'
3698 titles = list(re.finditer(namesRE, webpage))
# elided 3699: presumably `videos = []` accumulator — TODO confirm
3700 for vid,vtitle in zip(mweb,titles):
3701 video_id = vid.group('videoID')
3702 title = vtitle.group('videoName')
3703 video_url = vid.group('videoURL')
# elided 3704: presumably `if not video_url:` — TODO confirm
3705 self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
# elided 3706-3714: presumably per-video info dict append and final return — TODO confirm
class UstreamIE(InfoExtractor):
    """Information extractor for ustream.tv recorded videos.

    Defect fixed: the pasted source had the original file's line numbers
    fused into each code line, indentation stripped, and the result-dict
    braces and `return` elided (gaps in the original numbering); the
    canonical form is restored here.  External interface unchanged.
    """
    _VALID_URL = r'http://www.ustream.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        # The recorded-video id is the only dynamic part of the URL.
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        # Direct FLV location follows a fixed CDN path pattern.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        m = re.search(r'data-title="(?P<title>.+)"',webpage)
        title = m.group('title')
        m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
        uploader = m.group('uploader')
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': title,
                'uploader': uploader
                }
        return [info]
3738 def gen_extractors():
3739 """ Return a list of an instance of every supported extractor.
3740 The order does matter; the first extractor matched is the one handling the URL.
3743 YoutubePlaylistIE(),
3767 StanfordOpenClassroomIE(),