2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
14 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title: Video title, unescaped.
    ext: Video filename extension.
    uploader: Full name of the video uploader.
    upload_date: Video upload date (YYYYMMDD).

    The following fields are optional:

    format: The video format, defaults to ext (used for --get-format)
    thumbnail: Full URL to a video thumbnail image.
    description: One-line video description.
    uploader_id: Nickname or id of the video uploader.
    player_url: SWF Player URL (used for rtmpdump).
    subtitles: The .srt file contents.
    urlhandle: [internal] The urlHandle to be used to download the file,
    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # [gap: one source line elided in this extract]
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url) is not None

    # [gap: enclosing "def" line elided in this extract — the docstring
    # below belongs to a method whose header is missing]
        """Getter method for _WORKING."""

    # [gap: enclosing "def" line and a guard line elided in this extract]
        """Initializes an instance (authentication, etc)."""
        # [gap: source line elided]
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # [gap: one source line elided in this extract]
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        # [gap: method body line elided in this extract]

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""

    # [gap: enclosing "def"/property lines elided in this extract — the
    # statement below strips a trailing two-character suffix from the
    # class name (e.g. "YoutubeIE" -> "Youtube")]
        return type(self).__name__[:-2]

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Download a webpage, report progress, and return it decoded as UTF-8.

        Raises ExtractorError (with the original traceback) on network errors.
        """
        # [gap: guard line elided here — presumably "if note is None:"; TODO confirm]
        note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        # [gap: the opening "try:" is elided — the "except" below is orphaned]
        urlh = compat_urllib_request.urlopen(url_or_request)
        webpage_bytes = urlh.read()
        # 'replace' keeps extraction alive on malformed UTF-8 bytes.
        return webpage_bytes.decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # [gap: guard line elided here — presumably "if errnote is None:"; TODO confirm]
            errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # [gap: the "_VALID_URL = r'''" opening line is elided in this extract;
    # what follows is the interior of that verbose (re.VERBOSE) regex]
        (?:https?://)? # http(s):// (optional)
        (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
        tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
        (?:.*?\#/)? # handle anchor (#/) redirect urls
        (?!view_play_list|my_playlists|artist|playlist) # ignore playlist URLs
        (?: # the various things that can precede the ID:
        (?:(?:v|embed|e)/) # v/ or embed/ or e/
        |(?: # or the v= param in all its forms
        (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
        (?:\?|\#!?) # the params delimiter ? or # or #!
        (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
        # [gap: regex line(s) elided in this extract]
        )? # optional -> youtube.com/xxxx is OK
        )? # all until now is optional -> you can pass the naked ID
        ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
        (?(1).+)? # if we found the ID, everything can follow
    # [gap: regex closing line elided in this extract]
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # Maps itag -> container/extension; most entries elided in this extract.
    _video_extensions = {
        # [gap: dict entries elided]
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        # [gap: remaining entries and closing brace elided]
    # Maps itag -> display dimensions; entries elided in this extract.
    _video_dimensions = {
        # [gap: dict entries and closing brace elided]

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # re.VERBOSE is needed because _VALID_URL contains whitespace/comments.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's closed-caption XML into .srt file contents."""
        # [gap: accumulator initialisation elided here — presumably "srt = ''"; TODO confirm]
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # default caption duration when none given
            # [gap: conversion of `start` elided here]
            end = start + float(dur)
            # Format timestamps as SRT "HH:MM:SS,mmm".
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        # [gap: "return" line(s) elided in this extract]

    def _extract_subtitles(self, video_id):
        """Fetch the caption track list, pick a language, and return a
        (warning_or_None, srt_contents_or_None) tuple."""
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        # [gap: opening "try:" elided — the "except" below is orphaned]
        srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        # Map lang_code -> track name.
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        # Language priority: user-requested, then (per the elided branch) a default.
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            # [gap: branch body and following "else:" elided in this extract]
            srt_lang = list(srt_lang_list.keys())[0]
        if not srt_lang in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
        # [gap: opening "try:" elided — the "except" below is orphaned]
        srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        # [gap: guard line elided here — presumably "if not srt_xml:"; TODO confirm]
        return (u'WARNING: unable to download video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))

    def _print_formats(self, formats):
        """Print each available format with its extension and dimensions."""
        print('Available formats:')
        # [gap: loop header elided here — presumably "for x in formats:"; TODO confirm]
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set interface language, optionally log in, and confirm age."""
        if self._downloader is None:
            # [gap: branch body elided in this extract]

        # [gap: source line(s) elided]
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # [gap: opening "try:" elided — the "except" below is orphaned]
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            # [gap: lines unpacking the netrc result elided; the raise below
            # appears to be the no-authenticators branch]
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                # [gap: source line(s) elided]

        # Set language cookie so scraped pages are in English.
        request = compat_urllib_request.Request(self._LANG_URL)
        # [gap: opening "try:" (and progress report) elided — the "except" below is orphaned]
        compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
            # [gap: source line(s) elided]

        # No authentication to be performed
        # [gap: guard and the "login_form = {" opening elided; the entries
        # below are the body of that dict literal]
            'current_form': 'loginForm',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        # [gap: closing brace elided]
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        # [gap: opening "try:" elided — the "except" below is orphaned]
        login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        # If the login form is still present, the credentials were rejected.
        if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
            self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
            # [gap: source line(s) elided]
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            # [gap: source line(s) elided]

        # Confirm age; [gap: the "age_form = {" opening is elided]
            'action_confirm': 'Confirm',
        # [gap: closing brace elided]
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        # [gap: opening "try:" elided — the "except" below is orphaned]
        self.report_age_confirmation()
        age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            # [gap: source line(s) elided]

    def _extract_id(self, url):
        """Extract the bare YouTube video ID from a URL."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # [gap: "if mobj is None:" guard elided here — TODO confirm]
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        # [gap: source line(s) elided]
        video_id = mobj.group(2)
        # [gap: trailing line(s) elided — presumably "return video_id"; TODO confirm]

    def _real_extract(self, url):
        """Full extraction pipeline: resolve redirects, download the watch
        page and video info, pick formats, and build result dicts."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        # [gap: guard line elided here]
        url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Download the watch page (has_verified=1 skips some interstitials).
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        # [gap: opening "try:" elided — the "except" below is orphaned]
        video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            # [gap: source line(s) elided]
        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        # [gap: guard line elided here]
        # Unescape backslash-escaped characters in the matched URL.
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        # [gap: else-branch and following line(s) elided]

        # Try several "el" contexts until one returns a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            # [gap: opening "try:" elided — the "except" below is orphaned]
            video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
            video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                # [gap: branch body elided — presumably "break"; TODO confirm]
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                # [gap: source line(s) elided]
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            # [gap: "else:" elided here]
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            # [gap: source line(s) elided]

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            # [gap: source line(s) elided]

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
            # [gap: source line(s) elided]
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id (scraped from the watch page, optional)
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        # [gap: guard line elided here]
        video_uploader_id = mobj.group(1)
        # [gap: "else:" elided here]
        self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            # [gap: source line(s) elided]
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            # [gap: source line elided]
        else: # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date, scraped from the page and normalised to YYYYMMDD
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        # [gap: guard line elided here]
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
        for expression in format_expressions:
            # [gap: opening "try:" elided here]
            upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
            # [gap: except/handling lines elided]

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        # [gap: "else:" elided here]
            video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
            # [gap: guard line elided here]
            self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            # [gap: source line(s) elided — presumably default + "else:"]
        video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # itag -> signed URL.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            # [gap: "else:" elided here]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                # [gap: source line(s) elided]
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                # [gap: source line(s) elided]
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            # [gap: "else:" elided here]
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    # [gap: guard line elided here]
                    video_url_list = [(rf, url_map[rf])]
                    # [gap: source line elided — presumably "break"; TODO confirm]
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
        # [gap: source line(s) elided — presumably "return" and "else:"]
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
        # [gap: source line(s) elided — presumably results-list initialisation]

        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            # [gap: "results.append({" opening and 'id' entry elided; the
            # entries below are the body of the per-format result dict]
                'url': video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension,
                'format': video_format,
                'thumbnail': video_thumbnail,
                'description': video_description,
                'player_url': player_url,
                'subtitles': video_subtitles,
                'duration': video_duration
        # [gap: closing brace and "return results" elided in this extract]
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the family-filter disclaimer page and confirm age."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        # [gap: opening "try:" elided — the "except" below is orphaned]
        self.report_disclaimer()
        disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
            # [gap: source line(s) elided]

        # Confirm age; [gap: "disclaimer_form = {" opening elided — the entry
        # below is the body of that dict literal]
            'submit': "Continue - I'm over 18",
        # [gap: closing brace elided]
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        # [gap: opening "try:" elided — the "except" below is orphaned]
        self.report_age_confirmation()
        disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            # [gap: source line(s) elided]

    def _real_extract(self, url):
        """Extract video info, delegating to YouTube for yt- prefixed ids."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        # [gap: "if mobj is None:" guard elided here — TODO confirm]
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        # [gap: source line(s) elided]
        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate the whole download to the YouTube extractor.
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            # [gap: source line(s) elided]

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        # [gap: opening "try:" elided — the "except" below is orphaned]
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
            # [gap: source line(s) elided]

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        # [gap: guard line elided here]
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))
        video_extension = mediaURL[-3:]

        # Extract gdaKey if available
        mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
        # [gap: source line(s) elided]
        gdaKey = mobj.group(1)
        video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        # [gap: "else:" branch header elided — the flashvars fallback follows]
        mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
        # [gap: guard line elided here]
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        # [gap: source line(s) elided]
        vardict = compat_parse_qs(mobj.group(1))
        if 'mediaData' not in vardict:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            # [gap: source line(s) elided]
        mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
        # [gap: guard line elided here]
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        # [gap: source line(s) elided]
        # Un-escape the JSON-escaped slashes.
        mediaURL = mobj.group(1).replace('\\/', '/')
        video_extension = mediaURL[-3:]
        video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        # [gap: guard line elided here]
        self._downloader.trouble(u'ERROR: unable to extract title')
        # [gap: source line(s) elided]
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        # [gap: guard line elided here]
        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        # [gap: source line(s) elided]
        video_uploader = mobj.group(1)

        # [gap: "return [{" opening elided; the entries below are the body of
        # the result dict]
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            # [gap: dict entry elided here]
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
        # [gap: closing "}]" elided in this extract]
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the best-quality flashvars URL, title, uploader and date."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        # [gap: "if mobj is None:" guard elided here — TODO confirm]
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        # [gap: source line(s) elided]

        # Strip title/query suffixes from the captured id segment.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter so age-restricted pages are served.
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        # [gap: guard line elided here]
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        # [gap: source line(s) elided]
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Probe qualities from best to worst.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            # [gap: membership test and assignment elided here]
            self._downloader.to_screen(u'[dailymotion] Using %s' % key)
            # [gap: source line(s) elided — presumably "break" and "else:"]
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            # [gap: source line(s) elided]

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        # [gap: guard line elided here]
        self._downloader.trouble(u'ERROR: unable to extract video URL')
        # [gap: source line(s) elided]

        # Un-escape the JSON-escaped slashes.
        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        # [gap: guard line elided here]
        self._downloader.trouble(u'ERROR: unable to extract title')
        # [gap: source line(s) elided]
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        # [gap: guard line elided here]
        # lookin for official user
        mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
        if mobj_official is None:
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
        # [gap: "else:" elided here]
            video_uploader = mobj_official.group(1)
        # [gap: "else:" branch elided here]
        video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        # [gap: guard line elided here]
        # Reassemble DD-MM-YYYY into YYYYMMDD.
        video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        # [gap: "return [{" opening and 'id'/'url' entries elided; the entries
        # below are the body of the result dict]
            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
        # [gap: closing "}]" elided in this extract]
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the flv media URL, title and uploader from the page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        # [gap: "if mobj is None:" guard elided here — TODO confirm]
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
        # [gap: source line(s) elided]
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # [gap: opening "try:" elided — the "except" below is orphaned]
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            # [gap: source line(s) elided]

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        # [gap: guard line elided here]
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        # [gap: source line(s) elided]
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))
        # [gap: source line(s) elided — presumably video_url assignment]

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        # [gap: guard line elided here]
        self._downloader.trouble(u'ERROR: unable to extract title')
        # [gap: source line(s) elided]
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        # [gap: "return [{" opening elided; the entries below are the body of
        # the result dict]
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            # [gap: dict entry elided here]
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
        # [gap: closing "}]" elided in this extract]
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video info; non-/watch/ URLs are first rewritten to the
        canonical /watch/ form and re-extracted (new_video=False guards the
        single level of recursion)."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        # [gap: "if mobj is None:" guard elided here — TODO confirm]
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
        # [gap: source line(s) elided]
        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            # [gap: opening "try:" elided — the "except" below is orphaned]
            webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                # [gap: source line(s) elided]

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            # [gap: guard line elided here]
            self._downloader.trouble(u'ERROR: Unable to extract id field')
            # [gap: source line(s) elided]
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            # [gap: guard line elided here]
            self._downloader.trouble(u'ERROR: Unable to extract vid field')
            # [gap: source line(s) elided]
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # [gap: opening "try:" elided — the "except" below is orphaned]
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            # [gap: source line(s) elided]

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        # [gap: guard line elided here]
        self._downloader.trouble(u'ERROR: unable to extract video title')
        # [gap: source line(s) elided]
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        # [gap: guard line elided here]
        self._downloader.trouble(u'ERROR: unable to extract video uploader')
        # [gap: source line(s) elided]
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        # [gap: guard line elided here]
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        # [gap: source line(s) elided]
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        # [gap: guard line elided here]
        self._downloader.trouble(u'ERROR: unable to extract video description')
        # [gap: source line(s) elided]
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        # [gap: guard line elided here]
        self._downloader.trouble(u'ERROR: unable to extract video height')
        # [gap: source line(s) elided]
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        # [gap: guard line elided here]
        self._downloader.trouble(u'ERROR: unable to extract video width')
        # [gap: source line(s) elided]
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
        yv_bitrate = '700' # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        # [gap: opening "try:" elided — the "except" below is orphaned]
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            # [gap: source line(s) elided]

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        # [gap: guard line elided here]
        self._downloader.trouble(u'ERROR: Unable to extract media URL')
        # [gap: source line(s) elided]
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        # [gap: "return [{" opening elided; the entries below are the body of
        # the result dict]
            'id': video_id.decode('utf-8'),
            # [gap: dict entry elided here]
            'uploader': video_uploader,
            # [gap: dict entry elided here]
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
        # [gap: closing "}]" elided in this extract]
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs; group(1) is the numeric video id.
    _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the player page.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except:
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # Extract upload date (dateCreated is ISO 8601; collapse to YYYYMMDD)
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = {'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the best available bucket; for-else fires only when nothing matched.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live-stream pages end in index-<n>.html
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        # Download url and return the raw page bytes; None on failure.
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        # Fetch url, apply regex, and map the listed groups into a dict.
        # Each matchTuple is (group index, dict key, error message).
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        video_lang = url.split('/')[-3]
        # Step 1: the page embeds a videorefFileUrl parameter pointing at an XML index.
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        # Step 2: pick the language-specific ref out of the index.
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        # Step 3: the per-video XML carries id, title, date and the HD url.
        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title').decode('utf-8'),
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    Tried when no site-specific extractor claims the URL: follows URL-shortener
    redirects, then scrapes the page for a direct media URL (JW Player /
    SWFObject style flashvars, or bare file=/source= parameters).
    """

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Body-describing headers do not apply to the new HEAD request.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build a bare opener with just the handlers we need (no cookies etc.).
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True

    def _real_extract(self, url):
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            # BUG FIX: previously reported 'unable to extract title' here,
            # copy-pasted from the title branch above.
            self._downloader.trouble(u'ERROR: unable to extract uploader')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        # The "URL" is really a ytsearch<N|all>:<terms> pseudo-scheme.
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            # Bare "ytsearch:" downloads the first result only.
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        # The API serves 50 results per page; keep paging while more are needed.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Never ask for more than the API says exist.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        # The "URL" is really a gvsearch<N|all>:<terms> pseudo-scheme.
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                        return

            # No "next" link means the result set is exhausted.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        # The "URL" is really a yvsearch<N|all>:<terms> pseudo-scheme.
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers; the set dedupes across pages.
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            # No "next" link means the result set is exhausted.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Single video case: hand the video id straight back to the chain.
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])
            return

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        else:
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(playlist_id, pagenum)
            url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        total = len(video_ids)

        # Apply --playlist-start / --playlist-end (1-based start, -1 = open end).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        if len(video_ids) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Download channel pages
        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(channel_id, pagenum)
            url = self._TEMPLATE_URL % (channel_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []
            for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        # Apply --playlist-start / --playlist-end (1-based start, -1 = open end).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    Resolves a user page to its numeric users_id, then pages through the
    mobile full-episode-list endpoint to collect every video URL.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    # The mobile endpoint serves at most this many episodes per page.
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            mobj = re.search(r'data-users-id="([^"]+)"', page)
            page_base = page_base % mobj.group(1)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)

            request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # BUG FIX: was str(err) — every sibling extractor uses compat_str,
                # and py2 str() raises UnicodeEncodeError on non-ASCII messages.
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
# FacebookIE: extracts video info from facebook.com video/photo pages,
# optionally logging in first with credentials from --username/--password
# or ~/.netrc ('facebook' machine entry).
# NOTE(review): this excerpt is missing interleaved lines (several `try:`
# lines, `return`s after trouble() calls, and parts of dict literals) --
# the control flow shown is incomplete.
1974 class FacebookIE(InfoExtractor):
1975 """Information Extractor for Facebook"""
1978 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1979 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1980 _NETRC_MACHINE = 'facebook'
# Formats ordered best-first; used for format selection in _real_extract.
1981 _available_formats = ['video', 'highqual', 'lowqual']
1982 _video_extensions = {
1987 IE_NAME = u'facebook'
1989 def __init__(self, downloader=None):
1990 InfoExtractor.__init__(self, downloader)
1992 def _reporter(self, message):
1993 """Add header and report message."""
1994 self._downloader.to_screen(u'[facebook] %s' % message)
1996 def report_login(self):
1997 """Report attempt to log in."""
1998 self._reporter(u'Logging in')
2000 def report_video_webpage_download(self, video_id):
2001 """Report attempt to download video webpage."""
2002 self._reporter(u'%s: Downloading video webpage' % video_id)
2004 def report_information_extraction(self, video_id):
2005 """Report attempt to extract video information."""
2006 self._reporter(u'%s: Extracting video information' % video_id)
2008 def _parse_page(self, video_webpage):
2009 """Extract video information from page"""
# Map of info-dict key -> regex that captures its value from the page's
# embedded JavaScript.
2011 data = {'title': r'\("video_title", "(.*?)"\)',
2012 'description': r'<div class="datawrap">(.*?)</div>',
2013 'owner': r'\("video_owner_name", "(.*?)"\)',
2014 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2017 for piece in data.keys():
2018 mobj = re.search(data[piece], video_webpage)
2019 if mobj is not None:
# Values are \uXXXX-escaped inside the JS; unicode_escape undoes that.
2020 video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
2024 for fmt in self._available_formats:
2025 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2026 if mobj is not None:
2027 # URL is in a Javascript segment inside an escaped Unicode format within
2028 # the generally utf-8 page
2029 video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
2030 video_info['video_urls'] = video_urls
2034 def _real_initialize(self):
# No downloader means no credentials to read; nothing to do.
2035 if self._downloader is None:
2040 downloader_params = self._downloader.params
2042 # Attempt to use provided username and password or .netrc data
2043 if downloader_params.get('username', None) is not None:
2044 useremail = downloader_params['username']
2045 password = downloader_params['password']
2046 elif downloader_params.get('usenetrc', False):
2048 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2049 if info is not None:
2053 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2054 except (IOError, netrc.NetrcParseError) as err:
# .netrc problems are non-fatal: warn and continue anonymously.
2055 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
2058 if useremail is None:
2067 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2070 login_results = compat_urllib_request.urlopen(request).read()
# If the response still contains the login form, the login failed.
2071 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2072 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2074 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2075 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
2078 def _real_extract(self, url):
2079 mobj = re.match(self._VALID_URL, url)
2081 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2083 video_id = mobj.group('ID')
2086 self.report_video_webpage_download(video_id)
2087 request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2089 page = compat_urllib_request.urlopen(request)
2090 video_webpage = page.read()
2091 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2092 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2095 # Start extracting information
2096 self.report_information_extraction(video_id)
2098 # Extract information
2099 video_info = self._parse_page(video_webpage)
2102 if 'owner' not in video_info:
2103 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2105 video_uploader = video_info['owner']
2108 if 'title' not in video_info:
2109 self._downloader.trouble(u'ERROR: unable to extract video title')
2111 video_title = video_info['title']
2112 video_title = video_title.decode('utf-8')
# Missing thumbnail is only a warning, not fatal.
2115 if 'thumbnail' not in video_info:
2116 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2117 video_thumbnail = ''
2119 video_thumbnail = video_info['thumbnail']
2123 if 'upload_date' in video_info:
2124 upload_time = video_info['upload_date']
# parsedate_tz returns a 10-tuple; the first 9 items form a
# time-tuple usable by time.strftime to build YYYYMMDD.
2125 timetuple = email.utils.parsedate_tz(upload_time)
2126 if timetuple is not None:
2128 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2133 video_description = video_info.get('description', 'No description available.')
2135 url_map = video_info['video_urls']
2137 # Decide which formats to download
2138 req_format = self._downloader.params.get('format', None)
2139 format_limit = self._downloader.params.get('format_limit', None)
# --format-limit caps quality: keep only formats at or below the limit.
2141 if format_limit is not None and format_limit in self._available_formats:
2142 format_list = self._available_formats[self._available_formats.index(format_limit):]
2144 format_list = self._available_formats
2145 existing_formats = [x for x in format_list if x in url_map]
2146 if len(existing_formats) == 0:
2147 self._downloader.trouble(u'ERROR: no known formats available for video')
2149 if req_format is None:
2150 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2151 elif req_format == 'worst':
2152 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2153 elif req_format == '-1':
2154 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2157 if req_format not in url_map:
2158 self._downloader.trouble(u'ERROR: requested format not available')
2160 video_url_list = [(req_format, url_map[req_format])] # Specific format
2163 for format_param, video_real_url in video_url_list:
2165 video_extension = self._video_extensions.get(format_param, 'mp4')
2168 'id': video_id.decode('utf-8'),
2169 'url': video_real_url.decode('utf-8'),
2170 'uploader': video_uploader.decode('utf-8'),
2171 'upload_date': upload_date,
2172 'title': video_title,
2173 'ext': video_extension.decode('utf-8'),
2174 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2175 'thumbnail': video_thumbnail.decode('utf-8'),
2176 'description': video_description.decode('utf-8'),
# BlipTVIE: extracts from blip.tv by requesting the page with
# skin=json&version=2&no_wrap=1, which returns the video metadata as JSON.
# Direct video responses (Content-Type video/*) are handled without JSON.
# NOTE(review): this excerpt is missing interleaved lines (`try:` lines,
# `return`s, parts of the info dicts) -- the control flow shown is incomplete.
2180 class BlipTVIE(InfoExtractor):
2181 """Information extractor for blip.tv"""
2183 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# Used to pull the filename extension off the media URL.
2184 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2185 IE_NAME = u'blip.tv'
2187 def report_extraction(self, file_id):
2188 """Report information extraction."""
2189 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2191 def report_direct_download(self, title):
2192 """Report information extraction."""
2193 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2195 def _real_extract(self, url):
2196 mobj = re.match(self._VALID_URL, url)
2198 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# cchar (defined on a missing line; presumably '?' or '&' depending on
# whether the URL already has a query -- confirm) joins the JSON params.
2205 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2206 request = compat_urllib_request.Request(json_url)
2207 self.report_extraction(mobj.group(1))
2210 urlh = compat_urllib_request.urlopen(request)
2211 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2212 basename = url.split('/')[-1]
2213 title,ext = os.path.splitext(basename)
# Python-2 str->unicode conversion; ext keeps no leading dot.
2214 title = title.decode('UTF-8')
2215 ext = ext.replace('.', '')
2216 self.report_direct_download(title)
2221 'upload_date': None,
2226 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2227 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2229 if info is None: # Regular URL
2231 json_code_bytes = urlh.read()
2232 json_code = json_code_bytes.decode('utf-8')
2233 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2234 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
2238 json_data = json.loads(json_code)
# The payload is either wrapped in a 'Post' object or is the data itself.
2239 if 'Post' in json_data:
2240 data = json_data['Post']
# blip.tv datestamps look like '08-15-11 02:13PM'; normalized to YYYYMMDD.
2244 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2245 video_url = data['media']['url']
2246 umobj = re.match(self._URL_EXT, video_url)
2248 raise ValueError('Can not determine filename extension')
2249 ext = umobj.group(1)
2252 'id': data['item_id'],
2254 'uploader': data['display_name'],
2255 'upload_date': upload_date,
2256 'title': data['title'],
2258 'format': data['media']['mimeType'],
2259 'thumbnail': data['thumbnailUrl'],
2260 'description': data['description'],
2261 'player_url': data['embedUrl']
# Any malformed/missing JSON field is reported as one parse error.
2263 except (ValueError,KeyError) as err:
2264 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
# Spoof the iTunes User-Agent for the subsequent media download --
# presumably blip.tv serves direct files to iTunes clients (confirm).
2267 std_headers['User-Agent'] = 'iTunes/10.6.1'
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the flv URL and title for a myvideo.de watch page.

        Returns a list with a single info dictionary, or None after
        reporting trouble when the page cannot be parsed.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUG FIX: was `self._download.trouble(...)` -- `_download` is a
            # typo for `_downloader` and raised AttributeError instead of
            # reporting the invalid URL.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link encodes the movie's base URL; the flv lives
        # under the same path as '<video_id>.flv'.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': u'flv',
        }]
# ComedyCentralIE: handles The Daily Show / Colbert Report shortcuts
# (:tds, :colbert, ...), full-episode URLs and clip URLs, walking the
# MTV services MRSS index and per-media config XML to find RTMP streams.
# NOTE(review): this excerpt is missing interleaved lines (`try:` lines,
# `return`s, else-branches, parts of dict literals) -- the control flow
# shown is incomplete.
2320 class ComedyCentralIE(InfoExtractor):
2321 """Information extractor for The Daily Show and Colbert Report """
2323 # urls can be abbreviations like :thedailyshow or :colbert
2324 # urls for episodes like:
2325 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2326 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2327 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2328 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2329 |(https?://)?(www\.)?
2330 (?P<showname>thedailyshow|colbertnation)\.com/
2331 (full-episodes/(?P<episode>.*)|
2333 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2334 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
# Bitrates best-last; selection below picks turls[-1] by default.
2337 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2339 _video_extensions = {
2347 _video_dimensions = {
2356 def suitable(self, url):
2357 """Receives a URL and returns True if suitable for this IE."""
# Overridden because _VALID_URL is a verbose (re.VERBOSE) pattern.
2358 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
2360 def report_extraction(self, episode_id):
2361 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2363 def report_config_download(self, episode_id, media_id):
2364 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))
2366 def report_index_download(self, episode_id):
2367 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2369 def _print_formats(self, formats):
2370 print('Available formats:')
2372 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2375 def _real_extract(self, url):
2376 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2378 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Shortnames like :tds are rewritten to the show's full-episodes URL
# and re-matched against _VALID_URL.
2381 if mobj.group('shortname'):
2382 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2383 url = u'http://www.thedailyshow.com/full-episodes/'
2385 url = u'http://www.colbertnation.com/full-episodes/'
2386 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2387 assert mobj is not None
2389 if mobj.group('clip'):
2390 if mobj.group('showname') == 'thedailyshow':
2391 epTitle = mobj.group('tdstitle')
2393 epTitle = mobj.group('cntitle')
2396 dlNewest = not mobj.group('episode')
2398 epTitle = mobj.group('showname')
2400 epTitle = mobj.group('episode')
2402 req = compat_urllib_request.Request(url)
2403 self.report_extraction(epTitle)
2405 htmlHandle = compat_urllib_request.urlopen(req)
2406 html = htmlHandle.read()
2407 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2408 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
# The full-episodes page redirects to the newest episode; re-match the
# final URL to learn which episode we actually landed on.
2411 url = htmlHandle.geturl()
2412 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2414 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2416 if mobj.group('episode') == '':
2417 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2419 epTitle = mobj.group('episode')
# mgid URIs embedded in the page identify the media items.
2421 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', html)
2423 if len(mMovieParams) == 0:
2424 # The Colbert Report embeds the information in a without
2425 # a URL prefix; so extract the alternate reference
2426 # and then add the URL prefix manually.
2428 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', html)
2429 if len(altMovieParams) == 0:
2430 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2433 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2435 uri = mMovieParams[0][1]
2436 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2437 self.report_index_download(epTitle)
2439 indexXml = compat_urllib_request.urlopen(indexUrl).read()
2440 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2441 self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
# One <item> per episode part; each part is downloaded separately.
2446 idoc = xml.etree.ElementTree.fromstring(indexXml)
2447 itemEls = idoc.findall('.//item')
2448 for partNum,itemEl in enumerate(itemEls):
2449 mediaId = itemEl.findall('./guid')[0].text
2450 shortMediaId = mediaId.split(':')[-1]
2451 showId = mediaId.split(':')[-2].replace('.com', '')
2452 officialTitle = itemEl.findall('./title')[0].text
2453 officialDate = itemEl.findall('./pubDate')[0].text
2455 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2456 compat_urllib_parse.urlencode({'uri': mediaId}))
2457 configReq = compat_urllib_request.Request(configUrl)
2458 self.report_config_download(epTitle, shortMediaId)
2460 configXml = compat_urllib_request.urlopen(configReq).read()
2461 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2462 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
# turls (built on missing lines) collects (bitrate, rtmp url) pairs.
2465 cdoc = xml.etree.ElementTree.fromstring(configXml)
2467 for rendition in cdoc.findall('.//rendition'):
2468 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2472 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2475 if self._downloader.params.get('listformats', None):
2476 self._print_formats([i[0] for i in turls])
2479 # For now, just pick the highest bitrate
2480 format,rtmp_video_url = turls[-1]
2482 # Get the format arg from the arg stream
2483 req_format = self._downloader.params.get('format', None)
2485 # Select format if we can find one
2488 format, rtmp_video_url = f, v
# Rewrite the RTMP URL into a plain HTTP URL on the llnwd.net CDN.
2491 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2493 raise ExtractorError(u'Cannot transform RTMP url')
2494 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2495 video_url = base + m.group('finalid')
2497 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2502 'upload_date': officialDate,
2507 'description': officialTitle,
2509 results.append(info)
# EscapistIE: resolves escapistmagazine.com video pages via the og:video
# player URL, whose 'config=' query parameter points at a JSON-ish
# configuration containing the playlist with the real media URL.
# NOTE(review): this excerpt is missing interleaved lines (`try:` lines,
# `return`s, parts of the info dict) -- the control flow shown is incomplete.
2514 class EscapistIE(InfoExtractor):
2515 """Information extractor for The Escapist """
2517 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2518 IE_NAME = u'escapist'
2520 def report_extraction(self, showName):
2521 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2523 def report_config_download(self, showName):
2524 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2526 def _real_extract(self, url):
2527 mobj = re.match(self._VALID_URL, url)
2529 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2531 showName = mobj.group('showname')
2532 videoId = mobj.group('episode')
2534 self.report_extraction(showName)
2536 webPage = compat_urllib_request.urlopen(url)
2537 webPageBytes = webPage.read()
# Sniff the charset from the Content-Type header; default to utf-8.
2538 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2539 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2540 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2541 self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
# NOTE(review): the four re.search calls below assume their meta tags
# exist; a missing tag would raise AttributeError on .group(1).
2544 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2545 description = unescapeHTML(descMatch.group(1))
2546 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2547 imgUrl = unescapeHTML(imgMatch.group(1))
2548 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2549 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2550 configUrlMatch = re.search('config=(.*)$', playerUrl)
2551 configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2553 self.report_config_download(showName)
2555 configJSON = compat_urllib_request.urlopen(configUrl)
2556 m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
2557 configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
2558 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2559 self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
2562 # Technically, it's JavaScript, not JSON
# NOTE(review): blanket quote replacement would corrupt any value that
# itself contains an apostrophe -- fragile but matches the upstream feed.
2563 configJSON = configJSON.replace("'", '"')
2566 config = json.loads(configJSON)
2567 except (ValueError,) as err:
2568 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
2571 playlist = config['playlist']
# Index 1 holds the actual video entry (index 0 appears to be an intro --
# confirm against the feed).
2572 videoUrl = playlist[1]['url']
2577 'uploader': showName,
2578 'upload_date': None,
2581 'thumbnail': imgUrl,
2582 'description': description,
2583 'player_url': playerUrl,
# CollegeHumorIE: fetches the moogaloop metadata XML for a video id, then
# the Adobe HDS (f4m) manifest it references, and reconstructs the segment
# URL from the manifest's media node.
# NOTE(review): this excerpt is missing interleaved lines (`try:` lines,
# `return`s, parts of the info dict) -- the control flow shown is incomplete.
2588 class CollegeHumorIE(InfoExtractor):
2589 """Information extractor for collegehumor.com"""
2592 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2593 IE_NAME = u'collegehumor'
2595 def report_manifest(self, video_id):
2596 """Report information extraction."""
2597 self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))
2599 def report_extraction(self, video_id):
2600 """Report information extraction."""
2601 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2603 def _real_extract(self, url):
2604 mobj = re.match(self._VALID_URL, url)
2606 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2608 video_id = mobj.group('videoid')
2613 'upload_date': None,
2616 self.report_extraction(video_id)
2617 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2619 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2620 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2621 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
# findall(...)[0] raises IndexError on missing elements; presumably the
# surrounding (missing) try/except maps that to the trouble() below.
2624 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2626 videoNode = mdoc.findall('./video')[0]
2627 info['description'] = videoNode.findall('./description')[0].text
2628 info['title'] = videoNode.findall('./caption')[0].text
2629 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2630 manifest_url = videoNode.findall('./file')[0].text
2632 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
# hdcore= marks the request as coming from an HDS-capable client.
2635 manifest_url += '?hdcore=2.10.3'
2636 self.report_manifest(video_id)
2638 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2639 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2640 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2643 adoc = xml.etree.ElementTree.fromstring(manifestXml)
# f4m elements live in the Adobe f4m XML namespace.
2645 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2646 node_id = media_node.attrib['url']
2647 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2648 except IndexError as err:
2649 self._downloader.trouble(u'\nERROR: Invalid manifest file')
# Rebuild the first-segment URL on the manifest's host.
2652 url_pr = compat_urllib_parse_urlparse(manifest_url)
2653 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
# XVideosIE: scrapes the flv URL, title and thumbnail straight out of the
# xvideos.com watch page.
# NOTE(review): this excerpt is missing interleaved lines (each
# `if mobj is None:` guard and `return`, and parts of the info dict) --
# the control flow shown is incomplete.
2660 class XVideosIE(InfoExtractor):
2661 """Information extractor for xvideos.com"""
2663 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2664 IE_NAME = u'xvideos'
2666 def report_extraction(self, video_id):
2667 """Report information extraction."""
2668 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2670 def _real_extract(self, url):
2671 mobj = re.match(self._VALID_URL, url)
2673 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2675 video_id = mobj.group(1)
2677 webpage = self._download_webpage(url, video_id)
2679 self.report_extraction(video_id)
# The page embeds the media URL percent-encoded in a 'flv_url=' parameter.
2683 mobj = re.search(r'flv_url=(.+?)&', webpage)
2685 self._downloader.trouble(u'ERROR: unable to extract video url')
2687 video_url = compat_urllib_parse.unquote(mobj.group(1))
2691 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2693 self._downloader.trouble(u'ERROR: unable to extract video title')
2695 video_title = mobj.group(1)
2698 # Extract video thumbnail
2699 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2701 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
# group(0) is the whole matched URL, not just the filename capture.
2703 video_thumbnail = mobj.group(0)
2709 'upload_date': None,
2710 'title': video_title,
2712 'thumbnail': video_thumbnail,
2713 'description': None,
# SoundcloudIE: resolves a soundcloud.com track page through the public
# resolve API, then asks the streams endpoint for an mp3 URL.
# NOTE(review): this excerpt is missing interleaved lines (`try:` lines,
# `return`s, parts of the info dict) -- the control flow shown is incomplete.
2719 class SoundcloudIE(InfoExtractor):
2720 """Information extractor for soundcloud.com
2721 To access the media, the uid of the song and a stream token
2722 must be extracted from the page source and the script must make
2723 a request to media.soundcloud.com/crossdomain.xml. Then
2724 the media can be grabbed by requesting from an url composed
2725 of the stream token and uid
2728 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2729 IE_NAME = u'soundcloud'
2731 def __init__(self, downloader=None):
2732 InfoExtractor.__init__(self, downloader)
2734 def report_resolve(self, video_id):
2735 """Report information extraction."""
2736 self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))
2738 def report_extraction(self, video_id):
2739 """Report information extraction."""
2740 self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))
2742 def _real_extract(self, url):
2743 mobj = re.match(self._VALID_URL, url)
2745 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2748 # extract uploader (which is in the url)
2749 uploader = mobj.group(1)
2750 # extract simple title (uploader + slug of song title)
2751 slug_title = mobj.group(2)
2752 simple_title = uploader + u'-' + slug_title
2754 self.report_resolve('%s/%s' % (uploader, slug_title))
# The resolve endpoint maps the public page URL to track metadata JSON.
# NOTE(review): the client_id is hard-coded; it can be revoked upstream.
2756 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2757 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2758 request = compat_urllib_request.Request(resolv_url)
2760 info_json_bytes = compat_urllib_request.urlopen(request).read()
2761 info_json = info_json_bytes.decode('utf-8')
2762 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2763 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2766 info = json.loads(info_json)
2767 video_id = info['id']
2768 self.report_extraction('%s/%s' % (uploader, slug_title))
# Second request: the streams endpoint yields the actual media URLs.
2770 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2771 request = compat_urllib_request.Request(streams_url)
2773 stream_json_bytes = compat_urllib_request.urlopen(request).read()
2774 stream_json = stream_json_bytes.decode('utf-8')
2775 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2776 self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
2779 streams = json.loads(stream_json)
2780 mediaURL = streams['http_mp3_128_url']
2785 'uploader': info['user']['username'],
# NOTE(review): 'created_at' is an ISO-ish timestamp string, not the
# YYYYMMDD form the class docstring in this file prescribes -- confirm.
2786 'upload_date': info['created_at'],
2787 'title': info['title'],
2789 'description': info['description'],
# InfoQIE: decodes the base64 'jsclassref' value on an infoq.com page into
# the RTMPE path of the presentation video.
# NOTE(review): this excerpt is missing interleaved lines (`if mobj is
# None:` guards, `return`s, parts of the info dict) -- the control flow
# shown is incomplete.
2793 class InfoQIE(InfoExtractor):
2794 """Information extractor for infoq.com"""
2795 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2797 def report_extraction(self, video_id):
2798 """Report information extraction."""
2799 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2801 def _real_extract(self, url):
2802 mobj = re.match(self._VALID_URL, url)
2804 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# No separate id in the URL; the full URL doubles as the cache id.
2807 webpage = self._download_webpage(url, video_id=url)
2808 self.report_extraction(url)
2811 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2813 self._downloader.trouble(u'ERROR: unable to extract video url')
# jsclassref is base64 of a percent-encoded media path.
2815 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2816 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2819 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2821 self._downloader.trouble(u'ERROR: unable to extract video title')
2823 video_title = mobj.group(1)
2825 # Extract description
2826 video_description = u'No description available.'
2827 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2828 if mobj is not None:
2829 video_description = mobj.group(1)
# Derive the final id and extension from the media file name.
2831 video_filename = video_url.split('/')[-1]
2832 video_id, extension = video_filename.split('.')
2838 'upload_date': None,
2839 'title': video_title,
2840 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2842 'description': video_description,
# MixcloudIE: fetches the legacy /api/1/cloudcast/ JSON for a mix and picks
# a working URL out of its 'audio_formats' section. Disabled (_WORKING =
# False) pending a port to the newer Mixcloud API.
# NOTE(review): this excerpt is missing interleaved lines (`try:` lines,
# `return`s, else-branches) -- the control flow shown is incomplete; the
# .decode('utf-8') calls are Python-2 str->unicode conversions.
2847 class MixcloudIE(InfoExtractor):
2848 """Information extractor for www.mixcloud.com"""
2850 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2851 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2852 IE_NAME = u'mixcloud'
2854 def __init__(self, downloader=None):
2855 InfoExtractor.__init__(self, downloader)
2857 def report_download_json(self, file_id):
2858 """Report JSON download."""
2859 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2861 def report_extraction(self, file_id):
2862 """Report information extraction."""
2863 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2865 def get_urls(self, jsonData, fmt, bitrate='best'):
2866 """Get urls from 'audio_formats' section in json"""
2869 bitrate_list = jsonData[fmt]
2870 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
# NOTE(review): max() on the bitrate keys -- if keys are strings this
# is a lexicographic, not numeric, "highest".
2871 bitrate = max(bitrate_list) # select highest
2873 url_list = jsonData[fmt][bitrate]
2874 except TypeError: # we have no bitrate info.
2875 url_list = jsonData[fmt]
2878 def check_urls(self, url_list):
2879 """Returns 1st active url from list"""
# Probes each candidate with a full GET; first URL that opens wins.
2880 for url in url_list:
2882 compat_urllib_request.urlopen(url)
2884 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2889 def _print_formats(self, formats):
2890 print('Available formats:')
2891 for fmt in formats.keys():
2892 for b in formats[fmt]:
2894 ext = formats[fmt][b][0]
2895 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2896 except TypeError: # we have no bitrate info
2897 ext = formats[fmt][0]
2898 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2901 def _real_extract(self, url):
2902 mobj = re.match(self._VALID_URL, url)
2904 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2906 # extract uploader & filename from url
2907 uploader = mobj.group(1).decode('utf-8')
2908 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2910 # construct API request
2911 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2912 # retrieve .json file with links to files
2913 request = compat_urllib_request.Request(file_url)
2915 self.report_download_json(file_url)
2916 jsonData = compat_urllib_request.urlopen(request).read()
2917 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2918 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
2922 json_data = json.loads(jsonData)
2923 player_url = json_data['player_swf_url']
2924 formats = dict(json_data['audio_formats'])
2926 req_format = self._downloader.params.get('format', None)
2929 if self._downloader.params.get('listformats', None):
2930 self._print_formats(formats)
# 'best': first format whose URL list yields a live URL wins.
2933 if req_format is None or req_format == 'best':
2934 for format_param in formats.keys():
2935 url_list = self.get_urls(formats, format_param)
2937 file_url = self.check_urls(url_list)
2938 if file_url is not None:
2941 if req_format not in formats:
2942 self._downloader.trouble(u'ERROR: format is not available')
2945 url_list = self.get_urls(formats, req_format)
2946 file_url = self.check_urls(url_list)
2947 format_param = req_format
2950 'id': file_id.decode('utf-8'),
2951 'url': file_url.decode('utf-8'),
2952 'uploader': uploader.decode('utf-8'),
2953 'upload_date': None,
2954 'title': json_data['name'],
2955 'ext': file_url.split('.')[-1].decode('utf-8'),
2956 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2957 'thumbnail': json_data['thumbnail_url'],
2958 'description': json_data['description'],
2959 'player_url': player_url.decode('utf-8'),
2962 class StanfordOpenClassroomIE(InfoExtractor):
2963 """Information extractor for Stanford's Open ClassRoom"""
# NOTE(review): this class is excerpted with interior lines missing
# (numbering jumps such as 2977 -> 2979, 2993 -> 2995, 3034 -> 3037):
# guards, `try:` openers, dict/list openers and `return` statements are not
# visible. Comments below describe only what the visible lines show.
2965 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2966 IE_NAME = u'stanfordoc'
2968 def report_download_webpage(self, objid):
2969 """Report information extraction."""
2970 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2972 def report_extraction(self, video_id):
2973 """Report information extraction."""
2974 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2976 def _real_extract(self, url):
2977 mobj = re.match(self._VALID_URL, url)
2979 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Three dispatch levels: a single video, a course page, or the site root.
2982 if mobj.group('course') and mobj.group('video'): # A specific video
2983 course = mobj.group('course')
2984 video = mobj.group('video')
2986 'id': course + '_' + video,
2988 'upload_date': None,
2991 self.report_extraction(info['id'])
2992 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2993 xmlUrl = baseUrl + video + '.xml'
2995 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2996 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2997 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
# Title and video file come from the per-video XML descriptor.
2999 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3001 info['title'] = mdoc.findall('./title')[0].text
3002 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3004 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3006 info['ext'] = info['url'].rpartition('.')[2]
3008 elif mobj.group('course'): # A course page
3009 course = mobj.group('course')
3014 'upload_date': None,
3017 self.report_download_webpage(info['id'])
3019 coursepage = compat_urllib_request.urlopen(url).read()
3020 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3021 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
3024 m = re.search('<h1>([^<]+)</h1>', coursepage)
3026 info['title'] = unescapeHTML(m.group(1))
3028 info['title'] = info['id']
3030 m = re.search('<description>([^<]+)</description>', coursepage)
3032 info['description'] = unescapeHTML(m.group(1))
# Collect VideoPage links and recurse on each via self.extract().
3034 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3037 'type': 'reference',
3038 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3042 for entry in info['list']:
3043 assert entry['type'] == 'reference'
3044 results += self.extract(entry['url'])
# Root page: enumerate all CoursePage links and recurse the same way.
3049 'id': 'Stanford OpenClassroom',
3052 'upload_date': None,
3055 self.report_download_webpage(info['id'])
3056 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3058 rootpage = compat_urllib_request.urlopen(rootURL).read()
3059 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3060 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
3063 info['title'] = info['id']
3065 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3068 'type': 'reference',
3069 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3074 for entry in info['list']:
3075 assert entry['type'] == 'reference'
3076 results += self.extract(entry['url'])
3079 class MTVIE(InfoExtractor):
3080 """Information extractor for MTV.com"""
# NOTE(review): interior lines are missing from this excerpt (e.g. the
# `if mobj is None:` guards between each re.search and its trouble() call,
# the `try:` openers, and the final `return` / info-dict opener).
3082 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3085 def report_extraction(self, video_id):
3086 """Report information extraction."""
3087 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3089 def _real_extract(self, url):
3090 mobj = re.match(self._VALID_URL, url)
3092 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3094 if not mobj.group('proto'):
3095 url = 'http://' + url
3096 video_id = mobj.group('videoid')
3098 webpage = self._download_webpage(url, video_id)
# Scrape the mtv_* meta tags for song name, performer, playlist URI and
# content id; each is required for the mediaGen request below.
3100 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3102 self._downloader.trouble(u'ERROR: unable to extract song name')
3104 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3105 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3107 self._downloader.trouble(u'ERROR: unable to extract performer')
3109 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3110 video_title = performer + ' - ' + song_name
3112 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
# NOTE(review): the message below reads like it dropped a word
# ("unable to extract mtvn_uri") — runtime string left untouched here.
3114 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3116 mtvn_uri = mobj.group(1)
3118 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3120 self._downloader.trouble(u'ERROR: unable to extract content id')
3122 content_id = mobj.group(1)
3124 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3125 self.report_extraction(video_id)
3126 request = compat_urllib_request.Request(videogen_url)
3128 metadataXml = compat_urllib_request.urlopen(request).read()
3129 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3130 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
3133 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3134 renditions = mdoc.findall('.//rendition')
3136 # For now, always pick the highest quality.
3137 rendition = renditions[-1]
# Build a "<ext>-<width>x<height>_<bitrate>" format label from the
# rendition's MIME type and attributes.
3140 _,_,ext = rendition.attrib['type'].partition('/')
3141 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3142 video_url = rendition.find('./src').text
3144 self._downloader.trouble('Invalid rendition field.')
3150 'uploader': performer,
3151 'upload_date': None,
3152 'title': video_title,
3160 class YoukuIE(InfoExtractor):
# Information extractor for v.youku.com.
# NOTE(review): this excerpt is decimated — even the `def _gen_sid(self):`
# line (original 3171) is missing, as are guards, `try:` openers, format
# fallbacks and `return` statements. Comments only cover visible lines.
3161 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3163 def report_download_webpage(self, file_id):
3164 """Report webpage download."""
3165 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))
3167 def report_extraction(self, file_id):
3168 """Report information extraction."""
3169 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
# Body of _gen_sid (def line not visible here): a session id built from the
# current millisecond timestamp plus two random components.
3172 nowTime = int(time.time() * 1000)
3173 random1 = random.randint(1000,1998)
3174 random2 = random.randint(1000,9999)
3176 return "%d%d%d" %(nowTime,random1,random2)
3178 def _get_file_ID_mix_string(self, seed):
# Deterministic Fisher-Yates-like shuffle of the alphabet, driven by a
# linear-congruential sequence on `seed`; used to decode obfuscated file ids.
3180 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3182 for i in range(len(source)):
3183 seed = (seed * 211 + 30031 ) % 65536
3184 index = math.floor(seed / 65536 * len(source) )
3185 mixed.append(source[int(index)])
3186 source.remove(source[int(index)])
3187 #return ''.join(mixed)
3190 def _get_file_id(self, fileId, seed):
# Map the '*'-separated numeric tokens of fileId through the shuffled
# alphabet to recover the real file id.
3191 mixed = self._get_file_ID_mix_string(seed)
3192 ids = fileId.split('*')
3196 realId.append(mixed[int(ch)])
3197 return ''.join(realId)
3199 def _real_extract(self, url):
3200 mobj = re.match(self._VALID_URL, url)
3202 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3204 video_id = mobj.group('ID')
3206 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3208 request = compat_urllib_request.Request(info_url, None, std_headers)
3210 self.report_download_webpage(video_id)
3211 jsondata = compat_urllib_request.urlopen(request).read()
3212 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3213 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3216 self.report_extraction(video_id)
3218 jsonstr = jsondata.decode('utf-8')
3219 config = json.loads(jsonstr)
3221 video_title = config['data'][0]['title']
3222 seed = config['data'][0]['seed']
# Format selection: 'best' presumably prefers hd2, 'worst' the other end;
# the branch bodies are not visible in this excerpt.
3224 format = self._downloader.params.get('format', None)
3225 supported_format = list(config['data'][0]['streamfileids'].keys())
3227 if format is None or format == 'best':
3228 if 'hd2' in supported_format:
3233 elif format == 'worst':
3241 fileid = config['data'][0]['streamfileids'][format]
3242 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3243 except (UnicodeDecodeError, ValueError, KeyError):
3244 self._downloader.trouble(u'ERROR: unable to extract info section')
3248 sid = self._gen_sid()
3249 fileid = self._get_file_id(fileid, seed)
3251 #column 8,9 of fileid represent the segment number
3252 #fileid[7:9] should be changed
3253 for index, key in enumerate(keys):
# Each segment gets its own fileid (hex segment number spliced in) and a
# per-segment download URL keyed by `k`.
3255 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3256 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3259 'id': '%s_part%02d' % (video_id, index),
3260 'url': download_url,
3262 'upload_date': None,
3263 'title': video_title,
3266 files_info.append(info)
3271 class XNXXIE(InfoExtractor):
3272 """Information extractor for xnxx.com"""
# NOTE(review): interior lines are missing from this excerpt (`if ... is
# None:` guards, `try:` openers, the final `return` and parts of the info
# dict). Only visible lines are documented.
3274 _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
# Regexes scraping the flash player parameters out of the page source.
3276 VIDEO_URL_RE = r'flv_url=(.*?)&'
3277 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3278 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3280 def report_webpage(self, video_id):
3281 """Report information extraction"""
3282 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3284 def report_extraction(self, video_id):
3285 """Report information extraction"""
3286 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3288 def _real_extract(self, url):
3289 mobj = re.match(self._VALID_URL, url)
3291 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3293 video_id = mobj.group(1)
3295 self.report_webpage(video_id)
3297 # Get webpage content
3299 webpage_bytes = compat_urllib_request.urlopen(url).read()
3300 webpage = webpage_bytes.decode('utf-8')
3301 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3302 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
# Scrape URL (percent-decoded), title and thumbnail from the page.
3305 result = re.search(self.VIDEO_URL_RE, webpage)
3307 self._downloader.trouble(u'ERROR: unable to extract video url')
3309 video_url = compat_urllib_parse.unquote(result.group(1))
3311 result = re.search(self.VIDEO_TITLE_RE, webpage)
3313 self._downloader.trouble(u'ERROR: unable to extract video title')
3315 video_title = result.group(1)
3317 result = re.search(self.VIDEO_THUMB_RE, webpage)
3319 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3321 video_thumbnail = result.group(1)
3327 'upload_date': None,
3328 'title': video_title,
3330 'thumbnail': video_thumbnail,
3331 'description': None,
3335 class GooglePlusIE(InfoExtractor):
3336 """Information extractor for plus.google.com."""
# NOTE(review): interior lines are missing from this excerpt (`if mobj is
# None:` guards, `try:` openers, returns and the info-dict opener).
3338 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3339 IE_NAME = u'plus.google'
3341 def __init__(self, downloader=None):
3342 InfoExtractor.__init__(self, downloader)
3344 def report_extract_entry(self, url):
3345 """Report that the entry page is being downloaded."""
3346 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3348 def report_date(self, upload_date):
3349 """Report the extracted upload date."""
3350 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3352 def report_uploader(self, uploader):
3353 """Report the extracted uploader name."""
3354 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3356 def report_title(self, video_title):
3357 """Report the extracted video title."""
3358 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3360 def report_extract_vid_page(self, video_page):
3361 """Report information extraction."""
3362 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3364 def _real_extract(self, url):
3365 # Extract id from URL
3366 mobj = re.match(self._VALID_URL, url)
3368 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3371 post_url = mobj.group(0)
3372 video_id = mobj.group(1)
3374 video_extension = 'flv'
3376 # Step 1, Retrieve post webpage to extract further information
3377 self.report_extract_entry(post_url)
3378 request = compat_urllib_request.Request(post_url)
3380 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3381 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3382 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3385 # Extract update date
3387 pattern = 'title="Timestamp">(.*?)</a>'
3388 mobj = re.search(pattern, webpage)
3390 upload_date = mobj.group(1)
3391 # Convert timestring to a format suitable for filename
3392 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3393 upload_date = upload_date.strftime('%Y%m%d')
3394 self.report_date(upload_date)
# Extract uploader from the rel="author" anchor.
3398 pattern = r'rel\="author".*?>(.*?)</a>'
3399 mobj = re.search(pattern, webpage)
3401 uploader = mobj.group(1)
3402 self.report_uploader(uploader)
3405 # Get the first line for title
3407 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3408 mobj = re.search(pattern, webpage)
3410 video_title = mobj.group(1)
3411 self.report_title(video_title)
3413 # Step 2, Stimulate clicking the image box to launch video
3414 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3415 mobj = re.search(pattern, webpage)
3417 self._downloader.trouble(u'ERROR: unable to extract video page URL')
3419 video_page = mobj.group(1)
3420 request = compat_urllib_request.Request(video_page)
3422 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3423 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3424 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3426 self.report_extract_vid_page(video_page)
3429 # Extract video links on video page
3430 """Extract video links of all sizes"""
3431 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3432 mobj = re.findall(pattern, webpage)
3434 self._downloader.trouble(u'ERROR: unable to extract video links')
3436 # Sort in resolution
3437 links = sorted(mobj)
3439 # Choose the lowest of the sort, i.e. highest resolution
3440 video_url = links[-1]
3441 # Only get the url. The resolution part in the tuple has no use anymore
3442 video_url = video_url[-1]
3443 # Treat escaped \u0026 style hex
3445 video_url = video_url.decode("unicode_escape")
3446 except AttributeError: # Python 3
3447 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
# Fields of the returned info dictionary (opener/return not visible here).
3453 'uploader': uploader,
3454 'upload_date': upload_date,
3455 'title': video_title,
3456 'ext': video_extension,
3459 class NBAIE(InfoExtractor):
# Information extractor for nba.com video pages.
# NOTE(review): interior lines are missing from this excerpt (guards,
# `return` statements, parts of the info dict and of _findProp).
3460 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3463 def _real_extract(self, url):
3464 mobj = re.match(self._VALID_URL, url)
3466 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3469 video_id = mobj.group(1)
3470 if video_id.endswith('/index.html'):
3471 video_id = video_id[:-len('/index.html')]
3473 webpage = self._download_webpage(url, video_id)
# CDN URL is derived directly from the path component of the page URL.
3475 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
# Local helper: first regex group from the page, HTML-unescaped, or default.
3476 def _findProp(rexp, default=None):
3477 m = re.search(rexp, webpage)
3479 return unescapeHTML(m.group(1))
3483 shortened_video_id = video_id.rpartition('/')[2]
3484 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3486 'id': shortened_video_id,
# NOTE(review): key 'uploader_date' looks like a typo for 'upload_date'
# (every other extractor in this file uses 'upload_date') — confirm before
# changing, as this is runtime data.
3490 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3491 'description': _findProp(r'<div class="description">(.*?)</h1>'),
3495 class JustinTVIE(InfoExtractor):
3496 """Information extractor for justin.tv and twitch.tv"""
3497 # TODO: One broadcast may be split into multiple videos. The key
3498 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3499 # starts at 1 and increases. Can we treat all parts as one video?
# NOTE(review): interior lines are missing from this excerpt (`try:`
# openers, guards, dict openers, loop setup and `return` statements).
3501 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3502 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3503 _JUSTIN_PAGE_LIMIT = 100
3504 IE_NAME = u'justin.tv'
3506 def report_extraction(self, file_id):
3507 """Report information extraction."""
3508 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3510 def report_download_page(self, channel, offset):
3511 """Report attempt to download a single page of videos."""
3512 self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
3513 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3515 # Return count of items, list of *valid* items
3516 def _parse_page(self, url):
3518 urlh = compat_urllib_request.urlopen(url)
3519 webpage_bytes = urlh.read()
3520 webpage = webpage_bytes.decode('utf-8', 'ignore')
3521 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3522 self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
3525 response = json.loads(webpage)
# Build one info dict per clip in the API response page.
3527 for clip in response:
3528 video_url = clip['video_file_url']
3530 video_extension = os.path.splitext(video_url)[1][1:]
3531 video_date = re.sub('-', '', clip['created_on'][:10])
3535 'title': clip['title'],
3536 'uploader': clip.get('user_id', clip.get('channel_id')),
3537 'upload_date': video_date,
3538 'ext': video_extension,
3540 return (len(response), info)
3542 def _real_extract(self, url):
3543 mobj = re.match(self._VALID_URL, url)
3545 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Group 1 alone means a channel (paged archive listing); group 2 means a
# single clip.
3548 api = 'http://api.justin.tv'
3549 video_id = mobj.group(mobj.lastindex)
3551 if mobj.lastindex == 1:
3553 api += '/channel/archives/%s.json'
3555 api += '/clip/show/%s.json'
3556 api = api % (video_id,)
3558 self.report_extraction(video_id)
# Page through the API; a short page (< limit) ends the loop.
3562 limit = self._JUSTIN_PAGE_LIMIT
3565 self.report_download_page(video_id, offset)
3566 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3567 page_count, page_info = self._parse_page(page_url)
3568 info.extend(page_info)
3569 if not paged or page_count != limit:
3574 class FunnyOrDieIE(InfoExtractor):
# Information extractor for funnyordie.com.
# NOTE(review): interior lines are missing from this excerpt (`if ... is
# None:`/`return` guards, the else-branch for a missing description, and the
# final info dict with its return).
3575 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3577 def _real_extract(self, url):
3578 mobj = re.match(self._VALID_URL, url)
3580 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3583 video_id = mobj.group('id')
3584 webpage = self._download_webpage(url, video_id)
# Video URL comes from the second <source> inside the <video> tag.
3586 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3588 self._downloader.trouble(u'ERROR: unable to find video information')
3589 video_url = unescapeHTML(m.group('url'))
3591 m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
3593 self._downloader.trouble(u'Cannot find video title')
3594 title = unescapeHTML(m.group('title'))
3596 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3598 desc = unescapeHTML(m.group('desc'))
3607 'description': desc,
3611 class TweetReelIE(InfoExtractor):
# Information extractor for tweetreel.com.
# NOTE(review): interior lines are missing from this excerpt (`if m is
# None:` guards, `return` statements and the info-dict opener).
3612 _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'
3614 def _real_extract(self, url):
3615 mobj = re.match(self._VALID_URL, url)
3617 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3620 video_id = mobj.group('id')
3621 webpage = self._download_webpage(url, video_id)
3623 m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
3625 self._downloader.trouble(u'ERROR: Cannot find status ID')
3626 status_id = m.group(1)
3628 m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
3630 self._downloader.trouble(u'WARNING: Cannot find description')
# Tweet text with embedded <a> anchors stripped out.
3631 desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()
3633 m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
3635 self._downloader.trouble(u'ERROR: Cannot find uploader')
3636 uploader = unescapeHTML(m.group('uploader'))
3637 uploader_id = unescapeHTML(m.group('uploader_id'))
3639 m = re.search(r'<span unixtime="([0-9]+)"', webpage)
3641 self._downloader.trouble(u'ERROR: Cannot find upload date')
# Unix timestamp -> YYYYMMDD (local time of the machine running this).
3642 upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')
3645 video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'
3652 'description': desc,
3653 'uploader': uploader,
3654 'uploader_id': uploader_id,
3655 'internal_id': status_id,
3656 'upload_date': upload_date
3660 class SteamIE(InfoExtractor):
# Information extractor for store.steampowered.com game-trailer pages.
# NOTE(review): interior lines are missing from this excerpt (part of the
# verbose regex including the gameID group, the videos list setup, the
# info-dict opener and the final return).
3661 _VALID_URL = r"""http://store.steampowered.com/
3662 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3664 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3667 def suitable(self, url):
3668 """Receives a URL and returns True if suitable for this IE."""
# Overridden because _VALID_URL is a verbose (?x-style) pattern and must be
# matched with re.VERBOSE.
3669 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
3671 def _real_extract(self, url):
3672 m = re.match(self._VALID_URL, url, re.VERBOSE)
3673 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3674 gameID = m.group('gameID')
3675 videourl = 'http://store.steampowered.com/video/%s/' % gameID
3676 webpage = self._download_webpage(videourl, gameID)
# Pair each movie_* JS entry with its <span class="title"> in page order.
3677 mweb = re.finditer(urlRE, webpage)
3678 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3679 titles = re.finditer(namesRE, webpage)
3681 for vid,vtitle in zip(mweb,titles):
3682 video_id = vid.group('videoID')
3683 title = vtitle.group('videoName')
3684 video_url = vid.group('videoURL')
3686 self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
3691 'title': unescapeHTML(title)
3696 class UstreamIE(InfoExtractor):
# Information extractor for recorded ustream.tv videos.
# NOTE(review): interior lines are missing from this excerpt (the info-dict
# opener, several of its fields and the final return).
3697 _VALID_URL = r'http://www.ustream.tv/recorded/(?P<videoID>\d+)'
3698 IE_NAME = u'ustream'
3700 def _real_extract(self, url):
3701 m = re.match(self._VALID_URL, url)
3702 video_id = m.group('videoID')
# Direct CDN URL derived from the numeric recording id.
3703 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3704 webpage = self._download_webpage(url, video_id)
3705 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3706 title = m.group('title')
3707 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3708 uploader = m.group('uploader')
3714 'uploader': uploader
3719 def gen_extractors():
3720 """ Return a list of an instance of every supported extractor.
3721 The order does matter; the first extractor matched is the one handling the URL.
3724 YoutubePlaylistIE(),
3748 StanfordOpenClassroomIE(),