2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
14 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title: Video title, unescaped.
    ext: Video filename extension.
    uploader: Full name of the video uploader.
    upload_date: Video upload date (YYYYMMDD).

    The following fields are optional:

    format: The video format, defaults to ext (used for --get-format)
    thumbnail: Full URL to a video thumbnail image.
    description: One-line video description.
    uploader_id: Nickname or id of the video uploader.
    player_url: SWF Player URL (used for rtmpdump).
    subtitles: The .srt file contents.
    urlhandle: [internal] The urlHandle to be used to download the file,
    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # NOTE(review): readiness/_WORKING setup lines appear elided from
        # this view; only the downloader wiring remains.
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Subclasses whose _VALID_URL needs extra flags (e.g. re.VERBOSE)
        # override this method (see YoutubeIE.suitable below).
        return re.match(self._VALID_URL, url) is not None

    # NOTE(review): the `def` headers for the working()/initialize() pair
    # are elided from this view; the orphaned docstrings/body below belong
    # to them.
    """Getter method for _WORKING."""

    """Initializes an instance (authentication, etc)."""
    self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # NOTE(review): an initialize() call preceding extraction appears
        # elided from this view.
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""

    # NOTE(review): an IE_NAME property header is elided from this view;
    # its body derives the name by stripping the trailing "IE" from the
    # class name.
    return type(self).__name__[:-2]

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Return the page contents decoded as UTF-8 text.

        note/errnote customize the progress and error messages; network
        failures are re-raised as ExtractorError carrying the original
        traceback.
        """
        # NOTE(review): an `if note is None:` guard appears elided here.
        note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        # NOTE(review): the `try:` header appears elided from this view.
        urlh = compat_urllib_request.urlopen(url_or_request)
        webpage_bytes = urlh.read()
        # 'replace' keeps extraction alive on badly-encoded pages.
        return webpage_bytes.decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # NOTE(review): an `if errnote is None:` guard appears elided here.
            errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # NOTE(review): the `_VALID_URL = r"""^...` assignment header of this
    # verbose regex is elided from this view; the pattern matches the many
    # YouTube URL shapes and captures the video ID.
        (?:https?://)? # http(s):// (optional)
        (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
        tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
        (?:.*?\#/)? # handle anchor (#/) redirect urls
        (?!view_play_list|my_playlists|artist|playlist) # ignore playlist URLs
        (?: # the various things that can precede the ID:
        (?:(?:v|embed|e)/) # v/ or embed/ or e/
        |(?: # or the v= param in all its forms
        (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
        (?:\?|\#!?) # the params delimiter ? or # or #!
        (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
        )? # optional -> youtube.com/xxxx is OK
        )? # all until now is optional -> you can pass the naked ID
        ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
        (?(1).+)? # if we found the ID, everything can follow
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> filename-extension map; most entries elided from this view.
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # itag -> "WxH" display-dimension map; entries elided from this view.
    _video_dimensions = {

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # re.VERBOSE is required because _VALID_URL is a commented,
        # multi-line pattern; the base-class suitable() omits the flag.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's closed-caption XML into SubRip (.srt) text."""
        # NOTE(review): the `srt = ''` accumulator initialization appears
        # elided from this view.
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # default caption duration (seconds)
            # NOTE(review): a `start = float(start)` conversion appears
            # elided here; `end = start + float(dur)` requires it.
            end = start + float(dur)
            # Render HH:MM:SS,mmm timestamps as required by the srt format.
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        # NOTE(review): the `return srt` appears elided from this view.

    def _extract_subtitles(self, video_id):
        """Fetch closed captions for video_id.

        Returns a (warning_message, srt_contents) pair: exactly one of the
        two is None.
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        # NOTE(review): the `try:` header appears elided from this view.
        srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        # Map language code -> track name.
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        # Language preference: user-requested, then English, then whatever
        # comes first.
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            # NOTE(review): the `srt_lang = 'en'` line appears elided; the
            # fallback below belongs to a trailing `else:` branch.
            srt_lang = list(srt_lang_list.keys())[0]
        if not srt_lang in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
        # NOTE(review): the `try:` header appears elided from this view.
        srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        # NOTE(review): an `if not srt_xml:` guard appears elided here.
        return (u'WARNING: unable to download video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))

    def _print_formats(self, formats):
        """Print one line per available format: itag, extension, dimensions."""
        print('Available formats:')
        # NOTE(review): the `for x in formats:` header appears elided.
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set language, log in (params or .netrc), and confirm age."""
        if self._downloader is None:
            # NOTE(review): the early `return` appears elided from this view.

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # NOTE(review): the `try:` header appears elided from this view.
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            # NOTE(review): the username/password unpacking of `info` and
            # its `else:` branch appear elided here.
            raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # .netrc problems are non-fatal: warn and continue anonymous.
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        # NOTE(review): the `try:` header (and report_lang call) appear elided.
        compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # Failure to set language is non-fatal.
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))

        # No authentication to be performed
        # NOTE(review): the `if username is None: return` guard and the
        # `login_form = {` opener appear elided from this view.
        'current_form': 'loginForm',
        'action_login': 'Log In',
        'username': username,
        'password': password,
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        # NOTE(review): the `try:` header (and report_login call) appear elided.
        login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        # A login form in the response means the credentials were rejected.
        if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
            self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))

        # Confirm age
        # NOTE(review): the `age_form = {` opener and its first entry appear
        # elided from this view.
        'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        # NOTE(review): the `try:` header appears elided from this view.
        self.report_age_confirmation()
        age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # Age confirmation failure is fatal for age-gated videos.
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))

    def _extract_id(self, url):
        """Return the YouTube video ID for url, or report an error."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # NOTE(review): the `if mobj is None:` guard appears elided here.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        # Group 2 holds the ID captured by _VALID_URL.
        video_id = mobj.group(2)
        # NOTE(review): the `return video_id` appears elided from this view.

    def _real_extract(self, url):
        """Extract the info dict(s) for a YouTube watch URL."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        # NOTE(review): the `if mobj:` guard appears elided here.
        url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        # NOTE(review): the `try:` header appears elided from this view.
        video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        # 'ignore' rather than 'replace' to drop undecodable bytes.
        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        # NOTE(review): the `if mobj is not None:` guard appears elided here.
        # Un-escape the JS-escaped URL (\/ -> /).
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Get video info
        self.report_video_info_webpage_download(video_id)
        # Try several el= values; some videos only answer on certain ones.
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            # NOTE(review): the `try:` header appears elided from this view.
            video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
            video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                # NOTE(review): the `break` appears elided from this view.
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))

        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            # NOTE(review): the `else:` branch header appears elided here.
            self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        # NOTE(review): the `if mobj is not None:` guard appears elided here.
        video_uploader_id = mobj.group(1)
        # NOTE(review): the `else:` branch header appears elided here.
        self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            # NOTE(review): a `video_thumbnail = ''` fallback appears elided here.
        else: # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        # NOTE(review): the `upload_date = None` init and `if mobj` guard
        # appear elided here.
        # Normalize separators to spaces before trying the date formats.
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
        for expression in format_expressions:
            # NOTE(review): the try/except around strptime appears elided;
            # the first matching format wins.
            upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        # NOTE(review): the `else:` branch header appears elided here.
        video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
            # NOTE(review): the `if srt_error:` guard appears elided here.
            self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            # NOTE(review): a fallback assignment and `else:` appear elided.
        video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # itag -> signed download URL.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                # Cap quality at format_limit: keep it and everything below.
                format_list = available_formats[available_formats.index(format_limit):]
            # NOTE(review): the `else:` branch header appears elided here.
            format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                # NOTE(review): an early `return` appears elided here.
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            # NOTE(review): the `else:` branch header appears elided here.
            # Specific formats. We pick the first in a slash-delimeted sequence.
            # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
            req_formats = req_format.split('/')
            video_url_list = None
            for rf in req_formats:
                # NOTE(review): the `if rf in url_map:` guard appears elided.
                video_url_list = [(rf, url_map[rf])]
                # NOTE(review): the `break` appears elided from this view.
            if video_url_list is None:
                self._downloader.trouble(u'ERROR: requested format not available')
        # NOTE(review): the final `else:` branch header appears elided here.
        self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

        # Build one result dict per selected format.
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                    self._video_dimensions.get(format_param, '???'))

            # NOTE(review): the `results.append({` / `'id': video_id,`
            # opener lines appear elided from this view.
            'url': video_real_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
            'format': video_format,
            'thumbnail': video_thumbnail,
            'description': video_description,
            'player_url': player_url,
            'subtitles': video_subtitles,
            'duration': video_duration
        # NOTE(review): the `return results` appears elided from this view.
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page, then POST past the family filter."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        # NOTE(review): the `try:` header appears elided from this view.
        self.report_disclaimer()
        disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age
        # NOTE(review): the `disclaimer_form = {` opener and its first
        # entry appear elided from this view.
        'submit': "Continue - I'm over 18",
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        # NOTE(review): the `try:` header appears elided from this view.
        self.report_age_confirmation()
        disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        """Extract the info dict for a Metacafe watch URL."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard appears elided here.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate yt-prefixed IDs to the YouTube extractor via the
            # downloader.
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            # NOTE(review): the `return` appears elided from this view.

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        # NOTE(review): the `try:` header appears elided from this view.
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        # NOTE(review): the `if mobj is not None:` guard appears elided here.
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))
        video_extension = mediaURL[-3:]

        # Extract gdaKey if available
        mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
        # NOTE(review): the `if mobj is None:`/`else:` guards around the
        # gdaKey handling appear elided from this view.
        gdaKey = mobj.group(1)
        video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        # Fallback path: parse the flashvars blob instead.
        mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
        # NOTE(review): the `if mobj is None:` guard appears elided here.
        self._downloader.trouble(u'ERROR: unable to extract media URL')

        vardict = compat_parse_qs(mobj.group(1))
        if 'mediaData' not in vardict:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
        # NOTE(review): the `if mobj is None:` guard appears elided here.
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        # Un-escape JSON-escaped slashes in the URL.
        mediaURL = mobj.group(1).replace('\\/', '/')
        video_extension = mediaURL[-3:]
        video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        # NOTE(review): the `if mobj is None:` guard appears elided here.
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        # NOTE(review): the `if mobj is None:` guard appears elided here.
        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # NOTE(review): the `return [{` opener appears elided from this view.
        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'uploader': video_uploader.decode('utf-8'),
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the info dict for a Dailymotion video URL."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard appears elided here.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Strip the slug/query parts; keep only the bare video id.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter so age-gated videos are reachable.
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        # NOTE(review): the `if mobj is None:` guard appears elided here.
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Probe qualities from best to worst; first hit wins.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            # NOTE(review): the `if key in flashvars:` guard and
            # `max_quality = key` / `break` lines appear elided here.
            self._downloader.to_screen(u'[dailymotion] Using %s' % key)
        # NOTE(review): the for/else header appears elided from this view.
        self._downloader.trouble(u'ERROR: unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        # NOTE(review): the `if mobj is None:` guard appears elided here.
        self._downloader.trouble(u'ERROR: unable to extract video URL')

        # Un-escape JSON-escaped slashes in the URL.
        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        # NOTE(review): the `if mobj is None:` guard appears elided here.
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        # NOTE(review): the `if mobj is None:` branch header appears elided.
        # lookin for official user
        mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
        if mobj_official is None:
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
        # NOTE(review): the `else:` branch headers appear elided here.
        video_uploader = mobj_official.group(1)
        video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        # NOTE(review): the `if mobj is not None:` guard appears elided here.
        # Reassemble DD-MM-YYYY into YYYYMMDD.
        video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        # NOTE(review): the `return [{` opener with 'id'/'url' entries
        # appears elided from this view.
        'uploader': video_uploader,
        'upload_date': video_upload_date,
        'title': video_title,
        'ext': video_extension,
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the info dict for a Photobucket video URL."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard appears elided here.
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # NOTE(review): the `try:` header appears elided from this view.
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        # NOTE(review): the `if mobj is None:` guard appears elided here.
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))
        # NOTE(review): the `video_url = mediaURL` assignment appears elided.

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        # NOTE(review): the `if mobj is None:` guard appears elided here.
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        # NOTE(review): the `return [{` opener appears elided from this view.
        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'uploader': video_uploader,
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract the info dict for a Yahoo! Video URL.

        new_video=False marks the single recursive re-entry after a
        non-/watch/ URL has been rewritten to its /watch/ form.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard appears elided here.
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            # NOTE(review): the `try:` header appears elided from this view.
            webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            # NOTE(review): the `if mobj is None:` guard appears elided here.
            self._downloader.trouble(u'ERROR: Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            # NOTE(review): the `if mobj is None:` guard appears elided here.
            self._downloader.trouble(u'ERROR: Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            # Re-enter with the canonical /watch/ URL (exactly once).
            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # NOTE(review): the `try:` header appears elided from this view.
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        # NOTE(review): the `if mobj is None:` guard appears elided here.
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        # NOTE(review): the `if mobj is None:` guard appears elided here.
        self._downloader.trouble(u'ERROR: unable to extract video uploader')
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        # NOTE(review): the `if mobj is None:` guard appears elided here.
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        # NOTE(review): the `if mobj is None:` guard appears elided here.
        self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        # NOTE(review): the `if mobj is None:` guard appears elided here.
        self._downloader.trouble(u'ERROR: unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        # NOTE(review): the `if mobj is None:` guard appears elided here.
        self._downloader.trouble(u'ERROR: unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
        yv_bitrate = '700' # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        # NOTE(review): the `try:` header appears elided from this view.
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        # NOTE(review): the `if mobj is None:` guard appears elided here.
        self._downloader.trouble(u'ERROR: Unable to extract media URL')
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        # NOTE(review): the `return [{` opener and 'url' entry appear
        # elided from this view.
        'id': video_id.decode('utf-8'),
        'uploader': video_uploader,
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
        'thumbnail': video_thumbnail.decode('utf-8'),
        'description': video_description,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
    IE_NAME = u'vimeo'  # NOTE(review): reconstructed attribute (elided in listing) — verify

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the player page
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except:
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # Extract upload date (YYYYMMDD) from the dateCreated meta tag
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = {'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':           video_id,
            'url':          video_url,
            'uploader':     video_uploader,
            'uploader_id':  video_uploader_id,
            'upload_date':  video_upload_date,
            'title':        video_title,
            'ext':          video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        # Download a page, reporting progress; returns the raw bytes or None on error.
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        # Fetch *url* and apply *regex*; matchTuples is a list of
        # (group_index, key, error_message) used to build the info dict.
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        # NOTE(review): third capture group reconstructed from upstream youtube-dl — verify
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        # NOTE(review): trailing keys (ext/format/player_url) reconstructed — verify
        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    # NOTE(review): _VALID_URL was elided from the listing; upstream uses a
    # match-everything pattern for this last-resort extractor — verify
    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      headers=newheaders,
                                                                      origin_req_host=req.get_origin_req_host(),
                                                                      unverifiable=True))

        # Build our own opener so the HEAD-based handlers above are used
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True

    def _real_extract(self, url):
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':           video_id,
            'url':          video_url,
            'uploader':     video_uploader,
            'upload_date':  None,
            'title':        video_title,
            'ext':          video_extension,
        }]
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the leading 'ytsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # The API reports the true total; never request past it
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the leading 'gvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the leading 'yvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Single video case: the URL names a specific video inside the playlist
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])
            return

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        else:
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(playlist_id, pagenum)
            url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        total = len(video_ids)

        # Honor --playlist-start / --playlist-end
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        if len(video_ids) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Download channel pages
        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(channel_id, pagenum)
            url = self._TEMPLATE_URL % (channel_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []
            for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    # NOTE(review): reconstructed (elided in listing); the pagination comment
    # below and the len(ids_in_page) < self._PAGE_SIZE check imply 12 — verify
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            mobj = re.search(r'data-users-id="([^"]+)"', page)
            page_base = page_base % mobj.group(1)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)

            request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.
            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':           file_id.decode('utf-8'),
            'url':          file_url.decode('utf-8'),
            'uploader':     None,
            'upload_date':  None,
            'title':        file_title,
            'ext':          file_extension.decode('utf-8'),
        }]
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    # Formats ordered best-first; extension lookup per format.
    _available_formats = ['video', 'highqual', 'lowqual']
    _video_extensions = {
        'video': 'mp4',
        'highqual': 'mp4',
        'lowqual': 'mp4',
    }
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page"""
        # General data: each key maps to a regexp capturing its value.
        data = {
            'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
        }
        video_info = {}
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Video urls, one per known format.
        video_urls = {}
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

        return video_info

    def _real_initialize(self):
        # Log in if credentials are available; a downloader is required.
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In',
        }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # A login form in the response means authentication failed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        try:
            page = compat_urllib_request.urlopen(request)
            video_webpage = page.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = video_info['owner']

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')

        # thumbnail image
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:
            video_thumbnail = video_info['thumbnail']

        # upload date
        upload_date = None
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                try:
                    upload_date = time.strftime('%Y%m%d', timetuple[0:9])
                except:
                    pass

        # description
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)
        format_limit = self._downloader.params.get('format_limit', None)

        if format_limit is not None and format_limit in self._available_formats:
            format_list = self._available_formats[self._available_formats.index(format_limit):]
        else:
            format_list = self._available_formats
        existing_formats = [x for x in format_list if x in url_map]
        if len(existing_formats) == 0:
            self._downloader.trouble(u'ERROR: no known formats available for video')
            return
        if req_format is None:
            video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
        elif req_format == 'worst':
            video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
        elif req_format == '-1':
            video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
        else:
            if req_format not in url_map:
                self._downloader.trouble(u'ERROR: requested format not available')
                return
            video_url_list = [(req_format, url_map[req_format])] # Specific format

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'mp4')

            results.append({
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension.decode('utf-8'),
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description.decode('utf-8'),
            })
        return results
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Ask blip.tv for the JSON description of the page.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title, ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh,
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
            return

        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                }
            except (ValueError, KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        # blip.tv serves the media to the iTunes user agent.
        std_headers['User-Agent'] = 'iTunes/10.6.1'
        return [info]
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUGFIX: was `self._download.trouble(...)` — `_download` does not
            # exist on InfoExtractor; the attribute is `_downloader`.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link encodes the media server and movie path;
        # the flv URL is derived from it below.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': u'flv',
        }]
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""
    IE_NAME = u'comedycentral'

    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # bitrate -> container extension
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # bitrate -> frame dimensions (for --list-formats output)
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with re.VERBOSE, so override the default.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Expand :tds / :colbert shortcuts to the newest-full-episode URL.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # The front page redirects to the latest episode; re-match it.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', html)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', html)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f, v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the RTMP url to the equivalent progressive-HTTP one.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }

            results.append(info)

        return results
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Decode using the charset from the Content-Type header, utf-8 otherwise.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        # Pull description, thumbnail and the Flash player URL from og: meta tags.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'flv',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            # The f4m manifest carries the media node id and the real video id.
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')
            return

        # Build the first-segment URL on the manifest's host.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Extract video URL
        mobj = re.search(r'flv_url=(.+?)&', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1))

        # Extract title
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(0)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }

        return [info]
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # Resolve the page URL into a track id via the public API.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # Fetch the per-track stream descriptor to get the mp3 URL.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json_bytes = compat_urllib_request.urlopen(request).read()
            stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        return [{
            'id': info['id'],
            'url': mediaURL,
            'uploader': info['user']['username'],
            'upload_date': info['created_at'],
            'title': info['title'],
            'ext': u'mp3',
            'description': info['description'],
        }]
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL
        # The page embeds a base64-encoded media path in jsclassref.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                url = None

        return None

    def _print_formats(self, formats):
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Probe formats in order until one yields a live URL.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
        }]
2963 class StanfordOpenClassroomIE(InfoExtractor):
2964 """Information extractor for Stanford's Open ClassRoom"""
# Handles three URL shapes (see _VALID_URL): a specific video
# (course+video), a course page (course only), and the site root.
# Course/root pages return 'reference' entries that are re-dispatched
# through self.extract() recursively.
2966 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2967 IE_NAME = u'stanfordoc'
2969 def report_download_webpage(self, objid):
2970 """Report information extraction."""
2971 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2973 def report_extraction(self, video_id):
2974 """Report information extraction."""
2975 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2977 def _real_extract(self, url):
# NOTE(review): structural lines are missing throughout this view
# (if-mobj-None guards, try:, the `info = {` openers, returns).
2978 mobj = re.match(self._VALID_URL, url)
2980 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2983 if mobj.group('course') and mobj.group('video'): # A specific video
2984 course = mobj.group('course')
2985 video = mobj.group('video')
2987 'id': course + '_' + video,
2989 'upload_date': None,
2992 self.report_extraction(info['id'])
# Video metadata comes from a per-video XML file next to the videos dir.
2993 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2994 xmlUrl = baseUrl + video + '.xml'
2996 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2997 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2998 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
3000 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3002 info['title'] = mdoc.findall('./title')[0].text
3003 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3005 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3007 info['ext'] = info['url'].rpartition('.')[2]
3009 elif mobj.group('course'): # A course page
3010 course = mobj.group('course')
3015 'upload_date': None,
3018 self.report_download_webpage(info['id'])
3020 coursepage = compat_urllib_request.urlopen(url).read()
3021 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3022 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
# Title/description scraped with simple regexes; fall back to the id.
3025 m = re.search('<h1>([^<]+)</h1>', coursepage)
3027 info['title'] = unescapeHTML(m.group(1))
3029 info['title'] = info['id']
3031 m = re.search('<description>([^<]+)</description>', coursepage)
3033 info['description'] = unescapeHTML(m.group(1))
# Each VideoPage link becomes a 'reference' entry resolved recursively.
3035 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3038 'type': 'reference',
3039 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3043 for entry in info['list']:
3044 assert entry['type'] == 'reference'
3045 results += self.extract(entry['url'])
# else: the site root — enumerate every CoursePage link the same way.
3050 'id': 'Stanford OpenClassroom',
3053 'upload_date': None,
3056 self.report_download_webpage(info['id'])
3057 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3059 rootpage = compat_urllib_request.urlopen(rootURL).read()
3060 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3061 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
3064 info['title'] = info['id']
3066 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3069 'type': 'reference',
3070 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3075 for entry in info['list']:
3076 assert entry['type'] == 'reference'
3077 results += self.extract(entry['url'])
3080 class MTVIE(InfoExtractor):
3081 """Information extractor for MTV.com"""
3083 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3086 def report_extraction(self, video_id):
3087 """Report information extraction."""
3088 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3090 def _real_extract(self, url):
# Scrape song/performer/uri/content-id from <meta> tags on the video
# page, then fetch mediaGen XML and pick the last <rendition> (highest
# quality). NOTE(review): guard lines (if mobj is None:, try:, the
# result `info = {` opener and return) are missing from this view.
3091 mobj = re.match(self._VALID_URL, url)
3093 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3095 if not mobj.group('proto'):
3096 url = 'http://' + url
3097 video_id = mobj.group('videoid')
3099 webpage = self._download_webpage(url, video_id)
3101 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3103 self._downloader.trouble(u'ERROR: unable to extract song name')
3105 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3106 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3108 self._downloader.trouble(u'ERROR: unable to extract performer')
3110 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3111 video_title = performer + ' - ' + song_name
3113 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3115 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3117 mtvn_uri = mobj.group(1)
3119 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3121 self._downloader.trouble(u'ERROR: unable to extract content id')
3123 content_id = mobj.group(1)
3125 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3126 self.report_extraction(video_id)
3127 request = compat_urllib_request.Request(videogen_url)
3129 metadataXml = compat_urllib_request.urlopen(request).read()
3130 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3131 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
3134 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3135 renditions = mdoc.findall('.//rendition')
3137 # For now, always pick the highest quality.
3138 rendition = renditions[-1]
# format string is "<ext>-<width>x<height>_<bitrate>"
3141 _,_,ext = rendition.attrib['type'].partition('/')
3142 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3143 video_url = rendition.find('./src').text
3145 self._downloader.trouble('Invalid rendition field.')
3151 'uploader': performer,
3152 'upload_date': None,
3153 'title': video_title,
3161 class YoukuIE(InfoExtractor):
# Information extractor for v.youku.com. Videos are split into segments;
# each segment gets its own download URL derived from a seeded shuffle of
# the file id (_get_file_ID_mix_string / _get_file_id).
3162 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3164 def report_download_webpage(self, file_id):
3165 """Report webpage download."""
3166 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))
3168 def report_extraction(self, file_id):
3169 """Report information extraction."""
3170 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
# _gen_sid: session id = millisecond timestamp + two random components
# (def line not visible in this view — numbering jumps to 3173).
3173 nowTime = int(time.time() * 1000)
3174 random1 = random.randint(1000,1998)
3175 random2 = random.randint(1000,9999)
3177 return "%d%d%d" %(nowTime,random1,random2)
3179 def _get_file_ID_mix_string(self, seed):
# Deterministic pseudo-random shuffle of the source alphabet driven by
# `seed` (a linear-congruential step mod 65536).
3181 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3183 for i in range(len(source)):
3184 seed = (seed * 211 + 30031 ) % 65536
3185 index = math.floor(seed / 65536 * len(source) )
3186 mixed.append(source[int(index)])
3187 source.remove(source[int(index)])
3188 #return ''.join(mixed)
3191 def _get_file_id(self, fileId, seed):
# Decode the '*'-separated index string into characters of the shuffled
# alphabet to recover the real file id.
3192 mixed = self._get_file_ID_mix_string(seed)
3193 ids = fileId.split('*')
3197 realId.append(mixed[int(ch)])
3198 return ''.join(realId)
3200 def _real_extract(self, url):
# NOTE(review): guard/try/else lines are missing from this view.
3201 mobj = re.match(self._VALID_URL, url)
3203 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3205 video_id = mobj.group('ID')
3207 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3209 request = compat_urllib_request.Request(info_url, None, std_headers)
3211 self.report_download_webpage(video_id)
3212 jsondata = compat_urllib_request.urlopen(request).read()
3213 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3214 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3217 self.report_extraction(video_id)
3219 jsonstr = jsondata.decode('utf-8')
3220 config = json.loads(jsonstr)
3222 video_title = config['data'][0]['title']
3223 seed = config['data'][0]['seed']
# Format selection: 'best' prefers hd2 if present; 'worst' branch below.
3225 format = self._downloader.params.get('format', None)
3226 supported_format = list(config['data'][0]['streamfileids'].keys())
3228 if format is None or format == 'best':
3229 if 'hd2' in supported_format:
3234 elif format == 'worst':
3242 fileid = config['data'][0]['streamfileids'][format]
3243 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3244 except (UnicodeDecodeError, ValueError, KeyError):
3245 self._downloader.trouble(u'ERROR: unable to extract info section')
3249 sid = self._gen_sid()
3250 fileid = self._get_file_id(fileid, seed)
3252 #column 8,9 of fileid represent the segment number
3253 #fileid[7:9] should be changed
3254 for index, key in enumerate(keys):
# Patch the segment number into the fileid (hex, two digits) per segment.
3256 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3257 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3260 'id': '%s_part%02d' % (video_id, index),
3261 'url': download_url,
3263 'upload_date': None,
3264 'title': video_title,
3267 files_info.append(info)
3272 class XNXXIE(InfoExtractor):
3273 """Information extractor for xnxx.com"""
3275 _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
# Scraping patterns: flv url, page title, and big-thumbnail url.
3277 VIDEO_URL_RE = r'flv_url=(.*?)&'
3278 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3279 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3281 def report_webpage(self, video_id):
3282 """Report information extraction"""
3283 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3285 def report_extraction(self, video_id):
3286 """Report information extraction"""
3287 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3289 def _real_extract(self, url):
# Download the page, pull url/title/thumbnail via the three regexes
# above, and build the info dict. NOTE(review): guard/try/return lines
# and the `info = {` opener are missing from this view.
3290 mobj = re.match(self._VALID_URL, url)
3292 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3294 video_id = mobj.group(1)
3296 self.report_webpage(video_id)
3298 # Get webpage content
3300 webpage_bytes = compat_urllib_request.urlopen(url).read()
3301 webpage = webpage_bytes.decode('utf-8')
3302 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3303 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3306 result = re.search(self.VIDEO_URL_RE, webpage)
3308 self._downloader.trouble(u'ERROR: unable to extract video url')
# flv_url is percent-encoded in the page source.
3310 video_url = compat_urllib_parse.unquote(result.group(1))
3312 result = re.search(self.VIDEO_TITLE_RE, webpage)
3314 self._downloader.trouble(u'ERROR: unable to extract video title')
3316 video_title = result.group(1)
3318 result = re.search(self.VIDEO_THUMB_RE, webpage)
3320 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3322 video_thumbnail = result.group(1)
3328 'upload_date': None,
3329 'title': video_title,
3331 'thumbnail': video_thumbnail,
3332 'description': None,
3336 class GooglePlusIE(InfoExtractor):
3337 """Information extractor for plus.google.com."""
3339 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3340 IE_NAME = u'plus.google'
3342 def __init__(self, downloader=None):
3343 InfoExtractor.__init__(self, downloader)
3345 def report_extract_entry(self, url):
3346 """Report downloading extry"""
3347 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3349 def report_date(self, upload_date):
3350 """Report downloading extry"""
3351 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3353 def report_uploader(self, uploader):
3354 """Report downloading extry"""
3355 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3357 def report_title(self, video_title):
3358 """Report downloading extry"""
3359 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3361 def report_extract_vid_page(self, video_page):
3362 """Report information extraction."""
3363 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3365 def _real_extract(self, url):
# Two-step extraction: (1) scrape date/uploader/title from the post
# page, (2) follow the photos URL the player would open and collect the
# redirector.googlevideo.com links, taking the highest resolution.
# NOTE(review): guard/try/return lines are missing from this view.
3366 # Extract id from URL
3367 mobj = re.match(self._VALID_URL, url)
3369 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3372 post_url = mobj.group(0)
3373 video_id = mobj.group(1)
3375 video_extension = 'flv'
3377 # Step 1, Retrieve post webpage to extract further information
3378 self.report_extract_entry(post_url)
3379 request = compat_urllib_request.Request(post_url)
3381 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3382 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3383 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3386 # Extract update date
3388 pattern = 'title="Timestamp">(.*?)</a>'
3389 mobj = re.search(pattern, webpage)
3391 upload_date = mobj.group(1)
3392 # Convert timestring to a format suitable for filename
3393 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3394 upload_date = upload_date.strftime('%Y%m%d')
3395 self.report_date(upload_date)
# Extract uploader
3399 pattern = r'rel\="author".*?>(.*?)</a>'
3400 mobj = re.search(pattern, webpage)
3402 uploader = mobj.group(1)
3403 self.report_uploader(uploader)
3406 # Get the first line for title
3408 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3409 mobj = re.search(pattern, webpage)
3411 video_title = mobj.group(1)
3412 self.report_title(video_title)
3414 # Step 2, Stimulate clicking the image box to launch video
3415 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3416 mobj = re.search(pattern, webpage)
3418 self._downloader.trouble(u'ERROR: unable to extract video page URL')
3420 video_page = mobj.group(1)
3421 request = compat_urllib_request.Request(video_page)
3423 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3424 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3425 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3427 self.report_extract_vid_page(video_page)
3430 # Extract video links on video page
3431 """Extract video links of all sizes"""
3432 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3433 mobj = re.findall(pattern, webpage)
3435 self._downloader.trouble(u'ERROR: unable to extract video links')
3437 # Sort in resolution
3438 links = sorted(mobj)
3440 # Choose the lowest of the sort, i.e. highest resolution
3441 video_url = links[-1]
3442 # Only get the url. The resolution part in the tuple has no use anymore
3443 video_url = video_url[-1]
3444 # Treat escaped \u0026 style hex
3446 video_url = video_url.decode("unicode_escape")
3447 except AttributeError: # Python 3
3448 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3454 'uploader': uploader,
3455 'upload_date': upload_date,
3456 'title': video_title,
3457 'ext': video_extension,
3460 class NBAIE(InfoExtractor):
# Information extractor for nba.com video pages. The video URL is built
# directly from the path component; page metadata is scraped with
# _findProp below.
3461 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3464 def _real_extract(self, url):
# NOTE(review): guard/return lines and the info-dict entries between
# numbering gaps are missing from this view.
3465 mobj = re.match(self._VALID_URL, url)
3467 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3470 video_id = mobj.group(1)
3471 if video_id.endswith('/index.html'):
3472 video_id = video_id[:-len('/index.html')]
3474 webpage = self._download_webpage(url, video_id)
3476 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
3477 def _findProp(rexp, default=None):
# Helper: first regex group from the page, unescaped, else `default`.
3478 m = re.search(rexp, webpage)
3480 return unescapeHTML(m.group(1))
3484 shortened_video_id = video_id.rpartition('/')[2]
3485 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3487 'id': shortened_video_id,
3491 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3492 'description': _findProp(r'<div class="description">(.*?)</h1>'),
3496 class JustinTVIE(InfoExtractor):
3497 """Information extractor for justin.tv and twitch.tv"""
3498 # TODO: One broadcast may be split into multiple videos. The key
3499 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3500 # starts at 1 and increases. Can we treat all parts as one video?
3502 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3503 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3504 _JUSTIN_PAGE_LIMIT = 100
3505 IE_NAME = u'justin.tv'
3507 def report_extraction(self, file_id):
3508 """Report information extraction."""
3509 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3511 def report_download_page(self, channel, offset):
3512 """Report attempt to download a single page of videos."""
3513 self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
3514 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3516 # Return count of items, list of *valid* items
3517 def _parse_page(self, url):
# Fetch one JSON page of the justin.tv API and turn each clip into an
# info dict. NOTE(review): the try: opener and the `info` list/dict
# construction lines are missing from this view.
3519 urlh = compat_urllib_request.urlopen(url)
3520 webpage_bytes = urlh.read()
3521 webpage = webpage_bytes.decode('utf-8', 'ignore')
3522 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3523 self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
3526 response = json.loads(webpage)
3528 for clip in response:
3529 video_url = clip['video_file_url']
3531 video_extension = os.path.splitext(video_url)[1][1:]
# created_on is ISO-ish 'YYYY-MM-DD...'; strip dashes for upload_date.
3532 video_date = re.sub('-', '', clip['created_on'][:10])
3536 'title': clip['title'],
3537 'uploader': clip.get('user_id', clip.get('channel_id')),
3538 'upload_date': video_date,
3539 'ext': video_extension,
3541 return (len(response), info)
3543 def _real_extract(self, url):
# Channel URLs (lastindex == 1) page through the archives API; clip
# URLs hit the single-clip endpoint once.
3544 mobj = re.match(self._VALID_URL, url)
3546 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3549 api = 'http://api.justin.tv'
3550 video_id = mobj.group(mobj.lastindex)
3552 if mobj.lastindex == 1:
3554 api += '/channel/archives/%s.json'
3556 api += '/clip/show/%s.json'
3557 api = api % (video_id,)
3559 self.report_extraction(video_id)
3563 limit = self._JUSTIN_PAGE_LIMIT
# Paging loop: stop when a page comes back short of the limit.
3566 self.report_download_page(video_id, offset)
3567 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3568 page_count, page_info = self._parse_page(page_url)
3569 info.extend(page_info)
3570 if not paged or page_count != limit:
3575 class FunnyOrDieIE(InfoExtractor):
# Information extractor for funnyordie.com: video url from the <source>
# tag, title from the player heading, description from og:description.
3576 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3578 def _real_extract(self, url):
# NOTE(review): if-None guards and the final info dict/return are
# missing from this view (numbering gaps).
3579 mobj = re.match(self._VALID_URL, url)
3581 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3584 video_id = mobj.group('id')
3585 webpage = self._download_webpage(url, video_id)
3587 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3589 self._downloader.trouble(u'ERROR: unable to find video information')
3590 video_url = unescapeHTML(m.group('url'))
3592 m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
3594 self._downloader.trouble(u'Cannot find video title')
3595 title = unescapeHTML(m.group('title'))
3597 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3599 desc = unescapeHTML(m.group('desc'))
3608 'description': desc,
3612 class TweetReelIE(InfoExtractor):
# Information extractor for tweetreel.com: the .mov URL is derived from
# the embedded twitter status id; uploader/date come from page markup.
3613 _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'
3615 def _real_extract(self, url):
# NOTE(review): if-None guards and the `info = {` opener are missing
# from this view (numbering gaps).
3616 mobj = re.match(self._VALID_URL, url)
3618 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3621 video_id = mobj.group('id')
3622 webpage = self._download_webpage(url, video_id)
3624 m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
3626 self._downloader.trouble(u'ERROR: Cannot find status ID')
3627 status_id = m.group(1)
3629 m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
3631 self._downloader.trouble(u'WARNING: Cannot find description')
# Description is the tweet text with anchor tags stripped.
3632 desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()
3634 m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
3636 self._downloader.trouble(u'ERROR: Cannot find uploader')
3637 uploader = unescapeHTML(m.group('uploader'))
3638 uploader_id = unescapeHTML(m.group('uploader_id'))
3640 m = re.search(r'<span unixtime="([0-9]+)"', webpage)
3642 self._downloader.trouble(u'ERROR: Cannot find upload date')
# unixtime attribute -> YYYYMMDD for the filename convention.
3643 upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')
3646 video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'
3653 'description': desc,
3654 'uploader': uploader,
3655 'uploader_id': uploader_id,
3656 'internal_id': status_id,
3657 'upload_date': upload_date
3661 class SteamIE(InfoExtractor):
# Information extractor for store.steampowered.com trailer pages. A game
# page can carry several movies; each becomes one info dict.
3662 _VALID_URL = r"""http://store.steampowered.com/
3663 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3665 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3668 def suitable(self, url):
3669 """Receives a URL and returns True if suitable for this IE."""
# Overridden because _VALID_URL is a verbose (re.VERBOSE) pattern.
3670 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
3672 def _real_extract(self, url):
# Pair each movie_<id> JS entry with its <span class="title"> in page
# order. NOTE(review): the gameID group referenced below comes from a
# _VALID_URL line not visible in this view; videos-list construction
# and return are also missing (numbering gaps).
3673 m = re.match(self._VALID_URL, url, re.VERBOSE)
3674 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3675 gameID = m.group('gameID')
3676 videourl = 'http://store.steampowered.com/video/%s/' % gameID
3677 webpage = self._download_webpage(videourl, gameID)
3678 mweb = re.finditer(urlRE, webpage)
3679 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3680 titles = re.finditer(namesRE, webpage)
3682 for vid,vtitle in zip(mweb,titles):
3683 video_id = vid.group('videoID')
3684 title = vtitle.group('videoName')
3685 video_url = vid.group('videoURL')
3687 self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
3692 'title': unescapeHTML(title)
3697 class UstreamIE(InfoExtractor):
# Information extractor for ustream.tv recorded videos. The flv URL is
# built directly from the video id on the tcdn CDN.
3698 _VALID_URL = r'http://www.ustream.tv/recorded/(?P<videoID>\d+)'
3699 IE_NAME = u'ustream'
3701 def _real_extract(self, url):
# NOTE(review): the info dict opener and return are missing from this
# view (numbering gaps). No None-checks on the re.search results here —
# a page-layout change would raise AttributeError on .group().
3702 m = re.match(self._VALID_URL, url)
3703 video_id = m.group('videoID')
3704 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3705 webpage = self._download_webpage(url, video_id)
3706 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3707 title = m.group('title')
3708 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3709 uploader = m.group('uploader')
3715 'uploader': uploader
3720 def gen_extractors():
3721 """ Return a list of an instance of every supported extractor.
3722 The order does matter; the first extractor matched is the one handling the URL.
3725 YoutubePlaylistIE(),
3749 StanfordOpenClassroomIE(),