2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
13 import xml.etree.ElementTree
# NOTE(review): this extract is mangled — each line carries a fused original
# line number and several lines are elided (see gaps in that numbering).
# Code below is kept byte-identical; only comments/docstrings are added.
20 class InfoExtractor(object):
21 """Information Extractor class.
23 Information extractors are the classes that, given a URL, extract
24 information about the video (or videos) the URL refers to. This
25 information includes the real video URL, the video title, author and
26 others. The information is stored in a dictionary which is then
27 passed to the FileDownloader. The FileDownloader processes this
28 information possibly downloading the video to the file system, among
29 other possible outcomes.
31 The dictionaries must include the following fields:
# NOTE(review): the 'id:' and 'url:' mandatory-field lines appear to be
# elided here (fused numbering jumps 31 -> 35).
35 uploader: Nickname of the video uploader, unescaped.
36 upload_date: Video upload date (YYYYMMDD).
37 title: Video title, unescaped.
38 ext: Video filename extension.
40 The following fields are optional:
42 format: The video format, defaults to ext (used for --get-format)
43 thumbnail: Full URL to a video thumbnail image.
44 description: One-line video description.
45 player_url: SWF Player URL (used for rtmpdump).
46 subtitles: The .srt file contents.
47 urlhandle: [internal] The urlHandle to be used to download the file,
48 like returned by urllib.request.urlopen
50 The fields should all be Unicode strings.
52 Subclasses of this one should re-define the _real_initialize() and
53 _real_extract() methods and define a _VALID_URL regexp.
54 Probably, they should also be added to the list of extractors.
56 _real_extract() must return a *list* of information dictionaries as
# NOTE(review): continuation of this sentence ("described above") elided.
59 Finally, the _WORKING attribute should be set to False for broken IEs
60 in order to warn the users and skip the tests.
# NOTE(review): closing '"""' of the class docstring and the '_WORKING = True'
# attribute are elided here.
67 def __init__(self, downloader=None):
68 """Constructor. Receives an optional downloader."""
# stores the downloader via the setter so subclasses share one code path
70 self.set_downloader(downloader)
72 def suitable(self, url):
73 """Receives a URL and returns True if suitable for this IE."""
74 return re.match(self._VALID_URL, url) is not None
# NOTE(review): the 'def working(self):' line is elided — only its
# docstring and (elided) 'return self._WORKING' survive.
77 """Getter method for _WORKING."""
# NOTE(review): the 'def initialize(self):' line is elided before this
# docstring; it delegates to the subclass hook below.
81 """Initializes an instance (authentication, etc)."""
83 self._real_initialize()
86 def extract(self, url):
87 """Extracts URL information and returns it in list of dicts."""
# ensures initialize() ran first (the call is elided here), then delegates
89 return self._real_extract(url)
91 def set_downloader(self, downloader):
92 """Sets the downloader for this IE."""
93 self._downloader = downloader
95 def _real_initialize(self):
96 """Real initialization process. Redefine in subclasses."""
# (body is a no-op 'pass' — elided in this extract)
99 def _real_extract(self, url):
100 """Real extraction process. Redefine in subclasses."""
104 class YoutubeIE(InfoExtractor):
105 """Information extractor for youtube.com."""
# NOTE(review): the opening '_VALID_URL = r"""^(' of this verbose regex is
# elided; the fragment below is the interior of a re.VERBOSE pattern.
109 (?:https?://)? # http(s):// (optional)
110 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
111 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
112 (?:.*?\#/)? # handle anchor (#/) redirect urls
113 (?!view_play_list|my_playlists|artist|playlist) # ignore playlist URLs
114 (?: # the various things that can precede the ID:
115 (?:(?:v|embed|e)/) # v/ or embed/ or e/
116 |(?: # or the v= param in all its forms
117 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
118 (?:\?|\#!?) # the params delimiter ? or # or #!
119 (?:.+&)? # any other preceding param (like /?s=tuff&v=xxxx)
# NOTE(review): the 'v=' alternative itself is elided here (numbering jumps 119 -> 122).
122 )? # optional -> youtube.com/xxxx is OK
123 )? # all until now is optional -> you can pass the naked ID
124 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
125 (?(1).+)? # if we found the ID, everything can follow
# NOTE(review): closing '$"""' of the verbose regex elided here.
127 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
128 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
129 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
130 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
131 _NETRC_MACHINE = 'youtube'
132 # Listed in order of quality
133 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
134 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
# itag -> container extension; most of the dict body is elided in this extract
135 _video_extensions = {
141 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
# itag -> "WxH" display string; the dict body is elided in this extract
147 _video_dimensions = {
# Overrides the base class to compile _VALID_URL with re.VERBOSE, since the
# pattern above uses whitespace and inline comments.
165 def suitable(self, url):
166 """Receives a URL and returns True if suitable for this IE."""
167 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
169 def report_lang(self):
170 """Report attempt to set language."""
171 self._downloader.to_screen(u'[youtube] Setting language')
173 def report_login(self):
174 """Report attempt to log in."""
175 self._downloader.to_screen(u'[youtube] Logging in')
177 def report_age_confirmation(self):
178 """Report attempt to confirm age."""
179 self._downloader.to_screen(u'[youtube] Confirming age')
181 def report_video_webpage_download(self, video_id):
182 """Report attempt to download video webpage."""
183 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
185 def report_video_info_webpage_download(self, video_id):
186 """Report attempt to download video info webpage."""
187 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
189 def report_video_subtitles_download(self, video_id):
190 """Report attempt to download video subtitles."""
191 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
193 def report_information_extraction(self, video_id):
194 """Report attempt to extract video information."""
195 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
197 def report_unavailable_format(self, video_id, format):
198 """Report that the requested format is not available."""
199 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
201 def report_rtmp_download(self):
202 """Indicate the download will use the RTMP protocol."""
203 self._downloader.to_screen(u'[youtube] RTMP download detected')
# Converts YouTube's timedtext XML into .srt-formatted text.
# NOTE(review): the 'srt = '\'\''' accumulator initialization (used at line 217
# below) and the 'start = float(start)' conversion (needed before the float
# arithmetic at lines 212-213) are elided in this extract.
205 def _closed_captions_xml_to_srt(self, xml_string):
207 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
208 # TODO parse xml instead of regex
209 for n, (start, dur_tag, dur, caption) in enumerate(texts):
210 if not dur: dur = '4'  # captions with no dur attribute get a 4-second default
212 end = start + float(dur)
# format seconds as the SRT timestamp HH:MM:SS,mmm
213 start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
214 end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
215 caption = unescapeHTML(caption)
216 caption = unescapeHTML(caption) # double cycle, intentional
217 srt += str(n+1) + '\n'
218 srt += start + ' --> ' + end + '\n'
219 srt += caption + '\n\n'
# NOTE(review): the 'return srt' line is elided here.
# Prints one line per available itag: "itag : ext [dimensions]".
222 def _print_formats(self, formats):
223 print('Available formats:')
# (the 'for x in formats:' header is elided; this is its body)
225 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
# Sets interface language, then logs in (credentials from params or .netrc)
# and confirms age. Several 'try:'/'return' lines are elided throughout.
227 def _real_initialize(self):
228 if self._downloader is None:
233 downloader_params = self._downloader.params
235 # Attempt to use provided username and password or .netrc data
236 if downloader_params.get('username', None) is not None:
237 username = downloader_params['username']
238 password = downloader_params['password']
239 elif downloader_params.get('usenetrc', False):
# (the 'try:' for this netrc lookup is elided)
241 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
# (the unpacking of info into username/password, and the else branch, are elided)
246 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
247 except (IOError, netrc.NetrcParseError) as err:
248 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
# Set language: best-effort, failure only warns.
252 request = compat_urllib_request.Request(self._LANG_URL)
# (the 'try:' and self.report_lang() lines are elided)
255 compat_urllib_request.urlopen(request).read()
256 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
257 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
260 # No authentication to be performed
# (the 'if username is None: return' guard and 'login_form = {' opening are elided)
266 'current_form': 'loginForm',
268 'action_login': 'Log In',
269 'username': username,
270 'password': password,
272 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
# (the 'try:' and self.report_login() lines are elided)
275 login_results = compat_urllib_request.urlopen(request).read()
# a loginForm in the response means the credentials were rejected
276 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
277 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
279 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
280 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
# Confirm age ('age_form = {' opening elided).
286 'action_confirm': 'Confirm',
288 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
290 self.report_age_confirmation()
291 age_results = compat_urllib_request.urlopen(request).read()
292 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
293 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
# Main extraction: resolves redirect URLs, downloads the watch page and
# get_video_info, then builds one info dict per selected format.
# Many 'if mobj is None:' guards, 'try:' lines and 'return' statements are
# elided throughout (see jumps in the fused numbering).
296 def _real_extract(self, url):
297 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
298 mobj = re.search(self._NEXT_URL_RE, url)
# (guard 'if mobj is not None:' elided)
300 url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
302 # Extract video id from URL
303 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
305 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# group(2) is the ID capture; group(1) is the preceding-prefix group
307 video_id = mobj.group(2)
310 self.report_video_webpage_download(video_id)
311 request = compat_urllib_request.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
313 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
314 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
315 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
318 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
320 # Attempt to extract SWF player URL
321 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
# un-escape the JS-escaped URL (\\/ -> /)
323 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
# Try several 'el' values; stop at the first response carrying a token.
328 self.report_video_info_webpage_download(video_id)
329 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
330 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
331 % (video_id, el_type))
332 request = compat_urllib_request.Request(video_info_url)
334 video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
335 video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
336 video_info = compat_parse_qs(video_info_webpage)
337 if 'token' in video_info:
339 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
340 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
342 if 'token' not in video_info:
343 if 'reason' in video_info:
344 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
346 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
349 # Check for "rental" videos
350 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
351 self._downloader.trouble(u'ERROR: "rental" videos not supported')
354 # Start extracting information
355 self.report_information_extraction(video_id)
# uploader
358 if 'author' not in video_info:
359 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
361 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
# title
364 if 'title' not in video_info:
365 self._downloader.trouble(u'ERROR: unable to extract video title')
367 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
# thumbnail: missing thumbnail is only a warning, not fatal
370 if 'thumbnail_url' not in video_info:
371 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
373 else: # don't panic if we can't find it
374 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
# upload date: scraped from the watch page and normalized to YYYYMMDD
378 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
380 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
381 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
382 for expression in format_expressions:
# (the 'try:' and 'except ValueError: pass' around this strptime are elided)
384 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
# description
389 video_description = get_element_by_id("eow-description", video_webpage)
390 if video_description:
391 video_description = clean_html(video_description)
# (the 'else:' branch header is elided)
393 video_description = ''
# closed captions (only when --write-srt was requested)
396 video_subtitles = None
397 if self._downloader.params.get('writesubtitles', False):
# (the enclosing 'try:' for the Trouble-based control flow is elided)
399 self.report_video_subtitles_download(video_id)
400 request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
402 srt_list = compat_urllib_request.urlopen(request).read()
403 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
404 raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
405 srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
# map lang_code -> track name
406 srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
407 if not srt_lang_list:
408 raise Trouble(u'WARNING: video has no closed captions')
409 if self._downloader.params.get('subtitleslang', False):
410 srt_lang = self._downloader.params.get('subtitleslang')
411 elif 'en' in srt_lang_list:
# (the 'srt_lang = '\''en'\''' assignment and 'else:' header are elided)
# NOTE(review): dict.keys()[0] is Python-2-only — dict_keys is not
# subscriptable on Python 3; would need list(srt_lang_list)[0]. Confirm
# target interpreter.
414 srt_lang = srt_lang_list.keys()[0]
415 if not srt_lang in srt_lang_list:
416 raise Trouble(u'WARNING: no closed captions found in the specified language')
417 request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
419 srt_xml = compat_urllib_request.urlopen(request).read()
420 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
421 raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
# (the 'if not srt_xml:' guard is elided)
423 raise Trouble(u'WARNING: unable to download video subtitles')
424 video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
# NOTE(review): trouble[0] indexes the exception — works on Python 2 only;
# Python 3 exceptions are not subscriptable (use trouble.args[0]).
425 except Trouble as trouble:
426 self._downloader.trouble(trouble[0])
428 if 'length_seconds' not in video_info:
429 self._downloader.trouble(u'WARNING: unable to extract video duration')
# (the 'video_duration = '\'''\''' fallback and 'else:' header are elided)
432 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
# token
435 video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
437 # Decide which formats to download
438 req_format = self._downloader.params.get('format', None)
440 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
441 self.report_rtmp_download()
# RTMP streams have no itag, hence format_param None
442 video_url_list = [(None, video_info['conn'][0])]
443 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
444 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
445 url_data = [compat_parse_qs(uds) for uds in url_data_strs]
446 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
447 url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
449 format_limit = self._downloader.params.get('format_limit', None)
450 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
451 if format_limit is not None and format_limit in available_formats:
# cap quality at format_limit (lists are ordered best-first)
452 format_list = available_formats[available_formats.index(format_limit):]
# (the 'else:' header is elided)
454 format_list = available_formats
455 existing_formats = [x for x in format_list if x in url_map]
456 if len(existing_formats) == 0:
457 self._downloader.trouble(u'ERROR: no known formats available for video')
459 if self._downloader.params.get('listformats', None):
460 self._print_formats(existing_formats)
# (the 'return' after listing formats is elided)
462 if req_format is None or req_format == 'best':
463 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
464 elif req_format == 'worst':
465 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
466 elif req_format in ('-1', 'all'):
467 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
469 # Specific formats. We pick the first in a slash-delimeted sequence.
470 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
471 req_formats = req_format.split('/')
472 video_url_list = None
473 for rf in req_formats:
# (the 'if rf in url_map:' guard and trailing 'break' are elided)
475 video_url_list = [(rf, url_map[rf])]
477 if video_url_list is None:
478 self._downloader.trouble(u'ERROR: requested format not available')
# (the 'else:' branch header for the no-conn/no-stream-map case is elided)
481 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
# Build one result dict per selected format ('results = []' opening elided).
485 for format_param, video_real_url in video_url_list:
487 video_extension = self._video_extensions.get(format_param, 'flv')
489 video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
490 self._video_dimensions.get(format_param, '???'))
# (the 'results.append({' / '\''id'\'': video_id,' lines are elided)
494 'url': video_real_url,
495 'uploader': video_uploader,
496 'upload_date': upload_date,
497 'title': video_title,
498 'ext': video_extension,
499 'format': video_format,
500 'thumbnail': video_thumbnail,
501 'description': video_description,
502 'player_url': player_url,
503 'subtitles': video_subtitles,
504 'duration': video_duration
# (the closing '})' and 'return results' are elided)
509 class MetacafeIE(InfoExtractor):
510 """Information Extractor for metacafe.com."""
512 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
513 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
514 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
515 IE_NAME = u'metacafe'
517 def __init__(self, downloader=None):
518 InfoExtractor.__init__(self, downloader)
520 def report_disclaimer(self):
521 """Report disclaimer retrieval."""
522 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
524 def report_age_confirmation(self):
525 """Report attempt to confirm age."""
526 self._downloader.to_screen(u'[metacafe] Confirming age')
528 def report_download_webpage(self, video_id):
529 """Report webpage download."""
530 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
532 def report_extraction(self, video_id):
533 """Report information extraction."""
534 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
# Fetches the family-filter disclaimer, then POSTs the age confirmation so
# later page fetches are unfiltered. 'try:' lines and the disclaimer_form
# dict opening are elided in this extract.
536 def _real_initialize(self):
537 # Retrieve disclaimer
538 request = compat_urllib_request.Request(self._DISCLAIMER)
540 self.report_disclaimer()
541 disclaimer = compat_urllib_request.urlopen(request).read()
542 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
543 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
# Confirm age ('disclaimer_form = {' opening and 'filters': '0' entry elided)
549 'submit': "Continue - I'm over 18",
551 request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
553 self.report_age_confirmation()
554 disclaimer = compat_urllib_request.urlopen(request).read()
555 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
556 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
# Extracts the media URL either from &mediaURL=/&gdaKey= parameters or from
# the flashvars 'mediaData' JSON. Guards/returns are elided throughout.
559 def _real_extract(self, url):
560 # Extract id and simplified title from URL
561 mobj = re.match(self._VALID_URL, url)
# (the 'if mobj is None:' guard is elided)
563 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
566 video_id = mobj.group(1)
568 # Check if video comes from YouTube
569 mobj2 = re.match(r'^yt-(.*)$', video_id)
570 if mobj2 is not None:
# delegate yt-prefixed IDs to the YouTube extractor via the downloader
571 self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
# (the 'return' after delegation is elided)
574 # Retrieve video webpage to extract further information
575 request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
577 self.report_download_webpage(video_id)
578 webpage = compat_urllib_request.urlopen(request).read()
579 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
580 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
583 # Extract URL, uploader and title from webpage
584 self.report_extraction(video_id)
585 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
# (the 'if mobj is not None:' branch header is elided)
587 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
588 video_extension = mediaURL[-3:]
590 # Extract gdaKey if available
591 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
# (the no-gdaKey branch is elided; with a key the URL gains __gda__)
595 gdaKey = mobj.group(1)
596 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Fallback path: parse the flashvars block instead.
598 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
600 self._downloader.trouble(u'ERROR: unable to extract media URL')
602 vardict = compat_parse_qs(mobj.group(1))
603 if 'mediaData' not in vardict:
604 self._downloader.trouble(u'ERROR: unable to extract media URL')
606 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
608 self._downloader.trouble(u'ERROR: unable to extract media URL')
# un-escape JSON-escaped slashes
610 mediaURL = mobj.group(1).replace('\\/', '/')
611 video_extension = mediaURL[-3:]
612 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
614 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
616 self._downloader.trouble(u'ERROR: unable to extract title')
618 video_title = mobj.group(1).decode('utf-8')
620 mobj = re.search(r'submitter=(.*?);', webpage)
622 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
624 video_uploader = mobj.group(1)
# (the 'return [{' opening of the single-element result list is elided)
627 'id': video_id.decode('utf-8'),
628 'url': video_url.decode('utf-8'),
629 'uploader': video_uploader.decode('utf-8'),
631 'title': video_title,
632 'ext': video_extension.decode('utf-8'),
636 class DailymotionIE(InfoExtractor):
637 """Information Extractor for Dailymotion"""
639 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
640 IE_NAME = u'dailymotion'
642 def __init__(self, downloader=None):
643 InfoExtractor.__init__(self, downloader)
645 def report_download_webpage(self, video_id):
646 """Report webpage download."""
647 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
649 def report_extraction(self, video_id):
650 """Report information extraction."""
651 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
# Picks the best available quality key out of the page's flashvars and
# returns a single-element result list. Several guards ('if mobj is None:'),
# the quality-loop break/else, and the final 'return [{' are elided.
653 def _real_extract(self, url):
654 # Extract id and simplified title from URL
655 mobj = re.match(self._VALID_URL, url)
657 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# strip trailing '_title' slug and query string from the captured ID
660 video_id = mobj.group(1).split('_')[0].split('?')[0]
662 video_extension = 'mp4'
664 # Retrieve video webpage to extract further information
665 request = compat_urllib_request.Request(url)
# disable the family filter so restricted videos still resolve
666 request.add_header('Cookie', 'family_filter=off')
668 self.report_download_webpage(video_id)
669 webpage_bytes = compat_urllib_request.urlopen(request).read()
670 webpage = webpage_bytes.decode('utf-8')
671 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
672 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
675 # Extract URL, uploader and title from webpage
676 self.report_extraction(video_id)
677 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
679 self._downloader.trouble(u'ERROR: unable to extract media URL')
681 flashvars = compat_urllib_parse.unquote(mobj.group(1))
# qualities are tried best-first; first key present in flashvars wins
683 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
# (the 'if key in flashvars:' guard, 'max_quality = key' and 'break' are elided)
686 self._downloader.to_screen(u'[dailymotion] Using %s' % key)
# (the for/else no-quality-found header is elided)
689 self._downloader.trouble(u'ERROR: unable to extract video URL')
692 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
694 self._downloader.trouble(u'ERROR: unable to extract video URL')
697 video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
699 # TODO: support choosing qualities
701 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
703 self._downloader.trouble(u'ERROR: unable to extract title')
705 video_title = unescapeHTML(mobj.group('title'))
707 video_uploader = None
708 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
# (the 'if mobj is None:' branch header is elided)
710 # lookin for official user
711 mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
712 if mobj_official is None:
713 self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
# (the 'else:' headers around these two assignments are elided)
715 video_uploader = mobj_official.group(1)
717 video_uploader = mobj.group(1)
719 video_upload_date = None
720 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
# page shows DD-MM-YYYY; reorder to the canonical YYYYMMDD
722 video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
# (the 'return [{' opening with 'id'/'url' entries is elided)
727 'uploader': video_uploader,
728 'upload_date': video_upload_date,
729 'title': video_title,
730 'ext': video_extension,
734 class GoogleIE(InfoExtractor):
735 """Information extractor for video.google.com."""
737 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
738 IE_NAME = u'video.google'
740 def __init__(self, downloader=None):
741 InfoExtractor.__init__(self, downloader)
743 def report_download_webpage(self, video_id):
744 """Report webpage download."""
745 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
747 def report_extraction(self, video_id):
748 """Report information extraction."""
749 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
# Prefers the mp4 download_url; falls back to the flv videoUrl embedded in
# escaped JS. Guards, 'try:' lines, the video_url assignment and the final
# 'return [{' are elided in this extract.
751 def _real_extract(self, url):
752 # Extract id from URL
753 mobj = re.match(self._VALID_URL, url)
# (the 'if mobj is None:' guard is elided)
755 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
758 video_id = mobj.group(1)
760 video_extension = 'mp4'
762 # Retrieve video webpage to extract further information
763 request = compat_urllib_request.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
765 self.report_download_webpage(video_id)
766 webpage = compat_urllib_request.urlopen(request).read()
767 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
768 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
771 # Extract URL, uploader, and title from webpage
772 self.report_extraction(video_id)
773 mobj = re.search(r"download_url:'([^']+)'", webpage)
# (the 'if mobj is None:' fallback branch header is elided)
775 video_extension = 'flv'
776 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
778 self._downloader.trouble(u'ERROR: unable to extract media URL')
780 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
# translate the JS hex escapes: \x3d -> '=', \x26 -> '&'
781 mediaURL = mediaURL.replace('\\x3d', '\x3d')
782 mediaURL = mediaURL.replace('\\x26', '\x26')
# (the 'video_url = mediaURL' assignment is elided here)
786 mobj = re.search(r'<title>(.*)</title>', webpage)
788 self._downloader.trouble(u'ERROR: unable to extract title')
790 video_title = mobj.group(1).decode('utf-8')
792 # Extract video description
793 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
795 self._downloader.trouble(u'ERROR: unable to extract video description')
797 video_description = mobj.group(1).decode('utf-8')
798 if not video_description:
799 video_description = 'No description available.'
801 # Extract video thumbnail
802 if self._downloader.params.get('forcethumbnail', False):
# a second search-page fetch is needed just to find the thumbnail image
803 request = compat_urllib_request.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
# (the 'try:' line is elided)
805 webpage = compat_urllib_request.urlopen(request).read()
806 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
807 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
809 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
811 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
813 video_thumbnail = mobj.group(1)
814 else: # we need something to pass to process_info
# (the "video_thumbnail = ''" placeholder assignment is elided)
# (the 'return [{' opening of the result list is elided)
818 'id': video_id.decode('utf-8'),
819 'url': video_url.decode('utf-8'),
822 'title': video_title,
823 'ext': video_extension.decode('utf-8'),
827 class PhotobucketIE(InfoExtractor):
828 """Information extractor for photobucket.com."""
830 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
831 IE_NAME = u'photobucket'
833 def __init__(self, downloader=None):
834 InfoExtractor.__init__(self, downloader)
836 def report_download_webpage(self, video_id):
837 """Report webpage download."""
838 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
840 def report_extraction(self, video_id):
841 """Report information extraction."""
842 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
# Pulls the flv URL out of the page's video_src link tag. Guards, 'try:'
# lines, the 'video_url = mediaURL' assignment and the final 'return [{'
# are elided in this extract.
844 def _real_extract(self, url):
845 # Extract id from URL
846 mobj = re.match(self._VALID_URL, url)
# (the 'if mobj is None:' guard is elided)
848 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
851 video_id = mobj.group(1)
853 video_extension = 'flv'
855 # Retrieve video webpage to extract further information
856 request = compat_urllib_request.Request(url)
858 self.report_download_webpage(video_id)
859 webpage = compat_urllib_request.urlopen(request).read()
860 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
861 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
864 # Extract URL, uploader, and title from webpage
865 self.report_extraction(video_id)
866 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
868 self._downloader.trouble(u'ERROR: unable to extract media URL')
870 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
# title regex also captures the uploader in group(2)
874 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
876 self._downloader.trouble(u'ERROR: unable to extract title')
878 video_title = mobj.group(1).decode('utf-8')
880 video_uploader = mobj.group(2).decode('utf-8')
# (the 'return [{' opening of the single-element result list is elided)
883 'id': video_id.decode('utf-8'),
884 'url': video_url.decode('utf-8'),
885 'uploader': video_uploader,
887 'title': video_title,
888 'ext': video_extension.decode('utf-8'),
892 class YahooIE(InfoExtractor):
893 """Information extractor for video.yahoo.com."""
895 # _VALID_URL matches all Yahoo! Video URLs
896 # _VPAGE_URL matches only the extractable '/watch/' URLs
897 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
898 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
899 IE_NAME = u'video.yahoo'
901 def __init__(self, downloader=None):
902 InfoExtractor.__init__(self, downloader)
904 def report_download_webpage(self, video_id):
905 """Report webpage download."""
906 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
908 def report_extraction(self, video_id):
909 """Report information extraction."""
910 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
def _real_extract(self, url, new_video=True):
    """Extract the real video URL and metadata from a video.yahoo.com page.

    URLs that do not match _VPAGE_URL are first rewritten to an English
    /watch/ URL (scraped from the page) and the method recurses once with
    new_video=False.

    NOTE(review): several structural lines (`try:`, `if mobj is None:`,
    `return`, `else:`) appear to be missing from this copy of the source;
    the statements below are kept byte-for-byte as found.
    """
    # Extract ID from URL
    mobj = re.match(self._VALID_URL, url)
    self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
    video_id = mobj.group(2)
    video_extension = 'flv'

    # Rewrite valid but non-extractable URLs as
    # extractable English language /watch/ URLs
    if re.match(self._VPAGE_URL, url) is None:
        request = compat_urllib_request.Request(url)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Scrape the page-internal id/vid pair used to build the /watch/ URL.
        mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
        self._downloader.trouble(u'ERROR: Unable to extract id field')
        yahoo_id = mobj.group(1)

        mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
        self._downloader.trouble(u'ERROR: Unable to extract vid field')
        yahoo_vid = mobj.group(1)

        url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
        return self._real_extract(url, new_video=False)

    # Retrieve video webpage to extract further information
    request = compat_urllib_request.Request(url)
    self.report_download_webpage(video_id)
    webpage = compat_urllib_request.urlopen(request).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

    # Extract uploader and title from webpage
    self.report_extraction(video_id)
    mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
    self._downloader.trouble(u'ERROR: unable to extract video title')
    video_title = mobj.group(1).decode('utf-8')

    mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
    self._downloader.trouble(u'ERROR: unable to extract video uploader')
    # NOTE(review): group(1) here is the people/profile path component, not
    # the uploader name in group(2) — looks suspicious, confirm upstream.
    video_uploader = mobj.group(1).decode('utf-8')

    # Extract video thumbnail
    mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
    self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
    video_thumbnail = mobj.group(1).decode('utf-8')

    # Extract video description
    mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
    self._downloader.trouble(u'ERROR: unable to extract video description')
    video_description = mobj.group(1).decode('utf-8')
    if not video_description:
        video_description = 'No description available.'

    # Extract video height and width
    mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
    self._downloader.trouble(u'ERROR: unable to extract video height')
    yv_video_height = mobj.group(1)

    mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
    self._downloader.trouble(u'ERROR: unable to extract video width')
    yv_video_width = mobj.group(1)

    # Retrieve video playlist to extract media URL
    # I'm not completely sure what all these options are, but we
    # seem to need most of them, otherwise the server sends a 401.
    yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
    yv_bitrate = '700'  # according to Wikipedia this is hard-coded
    request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
            '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
            '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
    self.report_download_webpage(video_id)
    webpage = compat_urllib_request.urlopen(request).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

    # Extract media URL from playlist XML
    mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
    self._downloader.trouble(u'ERROR: Unable to extract media URL')
    video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
    video_url = unescapeHTML(video_url)

    # Result dictionary (the surrounding `return [{` appears to be elided
    # from this copy).
    'id': video_id.decode('utf-8'),
    'uploader': video_uploader,
    'upload_date': None,
    'title': video_title,
    'ext': video_extension.decode('utf-8'),
    'thumbnail': video_thumbnail.decode('utf-8'),
    'description': video_description,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com.

    NOTE(review): several structural lines (`try:`, `if mobj is None:`,
    `return`, `else:`) appear to be missing from this copy of the source;
    statements are kept byte-for-byte as found.
    """

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'

    def __init__(self, downloader=None):
        # Forward the optional downloader to the base class.
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the player page markup.
        config = webpage.split(' = {config:')[1].split(',assets:')[0]
        config = json.loads(config)
        self._downloader.trouble(u'ERROR: unable to extract info section')

        video_title = config["video"]["title"]

        video_uploader = config["video"]["owner"]["name"]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_id("description", webpage.decode('utf8'))
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the best available (codec, extension, quality) triple.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
            self._downloader.trouble(u'ERROR: no known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        # Result dictionary (the surrounding `return [{` appears to be
        # elided from this copy).
        'uploader': video_uploader,
        'upload_date': video_upload_date,
        'title': video_title,
        'ext': video_extension,
        'thumbnail': video_thumbnail,
        'description': video_description,
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor.

    NOTE(review): several structural lines (`try:`, `if mobj is None:`,
    `return`) appear to be missing from this copy of the source; statements
    are kept byte-for-byte as found.
    """

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live streams are recognized by their index-NNN.html page name.
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download *url* and return its raw contents."""
        self._downloader.increment_downloads()
        request = compat_urllib_request.Request(url)
        self.report_download_webpage(url)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex*, and map the numbered groups into a
        dict per *matchTuples*, each tuple being (group index, key, error)."""
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                info[key] = mobj.group(i)

    def extractLiveStream(self, url):
        """Resolve the RTMP path/player for a live arte.tv stream page."""
        # Language code is a fixed path component of live URLs.
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            r'src="(.*?/videothek_js.*?\.js)',
            (1, 'url', u'ERROR: Invalid URL: %s' % url)
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
            (1, 'path', u'ERROR: could not extract video path: %s' % url),
            (2, 'player', u'ERROR: could not extract video player: %s' % url),
            (3, 'url', u'ERROR: could not extract video url: %s' % url)
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve metadata for an arte+7 (catch-up) video page."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            (1, 'url', u'ERROR: Invalid URL: %s' % url)
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            (1, 'id', u'ERROR: could not extract video id: %s' % url),
            (2, 'title', u'ERROR: could not extract video title: %s' % url),
            (3, 'date', u'ERROR: could not extract video date: %s' % url),
            (4, 'url', u'ERROR: could not extract video url: %s' % url)

        # Result dictionary (the surrounding `return {` appears to be
        # elided from this copy).
        'id': info.get('id'),
        'url': compat_urllib_parse.unquote(info.get('url')),
        'uploader': u'arte.tv',
        'upload_date': info.get('date'),
        'title': info.get('title'),

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        # Dispatch: live pages vs. arte+7 catch-up pages.
        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            info = self.extractPlus7Stream(url)
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    NOTE(review): several structural lines (`try:`, `if mobj is None:`,
    `return`) appear to be missing from this copy of the source; statements
    are kept byte-for-byte as found.
    """

    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            # Issue HEAD instead of GET so we only fetch headers.
            def get_method(self):

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       origin_req_host=req.get_origin_req_host(),
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      origin_req_host=req.get_origin_req_host(),

        # Build a bare opener with only the handlers we need.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        self.report_following_redirect(new_url)
        self._downloader.download([new_url])

    def _real_extract(self, url):
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        # Broaden the search a little bit
        mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_uploader = mobj.group(1).decode('utf-8')

        # Result dictionary (the surrounding `return [{` appears to be
        # elided from this copy).
        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'uploader': video_uploader,
        'upload_date': None,
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    NOTE(review): some structural lines (`if prefix == '':`, `try:`,
    `return`, loop-counter updates) appear to be missing from this copy of
    the source; statements are kept byte-for-byte as found.
    """
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    # GData API endpoint; %s is the quoted query, %i the 1-based start index.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        # Split "ytsearchN:terms" into the count prefix and the terms.
        prefix, query = query.split(':')
        query = query.encode('utf-8')
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
            elif n > self._max_youtube_results:
                self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                n = self._max_youtube_results
                self._download_n_results(query, n)
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # Page through the API 50 results at a time until limit is reached.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            data = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            limit = min(n, api_response['totalItems'])

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    NOTE(review): some structural lines (`if prefix == '':`, `try:`,
    `return`) appear to be missing from this copy of the source; statements
    are kept byte-for-byte as found.
    """
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    # Anchor pattern whose group(1) is the docid of a result video.
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        # Split "gvsearchN:terms" into the count prefix and the terms.
        prefix, query = query.split(':')
        query = query.encode('utf-8')
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
            elif n > self._max_google_results:
                self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                n = self._max_google_results
                self._download_n_results(query, n)
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
        request = compat_urllib_request.Request(result_url)
        page = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in video_ids:
                video_ids.append(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])

        # No "next" link means this was the last results page.
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            for id in video_ids:
                self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])

        pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    NOTE(review): some structural lines (`if prefix == '':`, `try:`,
    `return`, the paging loop header) appear to be missing from this copy
    of the source; statements are kept byte-for-byte as found.
    """
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    # group(1) is the "uploader/video" id pair used in /watch/ URLs.
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        # Split "yvsearchN:terms" into the count prefix and the terms.
        prefix, query = query.split(':')
        query = query.encode('utf-8')
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
            elif n > self._max_yahoo_results:
                self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                n = self._max_yahoo_results
                self._download_n_results(query, n)
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        already_seen = set()

        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
        request = compat_urllib_request.Request(result_url)
        page = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in already_seen:
                video_ids.append(video_id)
                already_seen.add(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._downloader.download(['http://video.yahoo.com/watch/%s' % id])

        # No "Next" link means this was the last results page.
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            for id in video_ids:
                self._downloader.download(['http://video.yahoo.com/watch/%s' % id])

        pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists.

    NOTE(review): some structural lines (`if mobj is None:`, `try:`,
    `return`, the paging loop header, the else branch of the prefix check)
    appear to be missing from this copy of the source; statements are kept
    byte-for-byte as found.
    """

    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    # %s is the playlist id; group(1) of a match is the watch video id.
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        # Single-video URL inside a playlist context: hand off directly.
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)

        self.report_download_page(playlist_id, pagenum)
        url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
        request = compat_urllib_request.Request(url)
        page = compat_urllib_request.urlopen(request).read().decode('utf8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        video_ids.extend(ids_in_page)

        if self._MORE_PAGES_INDICATOR not in page:
            pagenum = pagenum + 1

        total = len(video_ids)

        # Apply --playlist-start / --playlist-end slicing (1-based options).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
            video_ids = video_ids[playliststart:playlistend]

        if len(video_ids) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels.

    NOTE(review): some structural lines (`if mobj is None:`, `try:`,
    `return`, the paging loop header and id-list initializations) appear to
    be missing from this copy of the source; statements are kept
    byte-for-byte as found.
    """

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        # Download channel pages
        channel_id = mobj.group(1)

        self.report_download_page(channel_id, pagenum)
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        request = compat_urllib_request.Request(url)
        page = compat_urllib_request.urlopen(request).read().decode('utf8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        video_ids.extend(ids_in_page)

        if self._MORE_PAGES_INDICATOR not in page:
            pagenum = pagenum + 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users.

    NOTE(review): some structural lines (`if mobj is None:`, `try:`,
    `return`/`break`, the paging loop header and list initializations)
    appear to be missing from this copy of the source; statements are kept
    byte-for-byte as found.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # GData caps results per request; we page in chunks of this size.
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        start_index = pagenum * self._GDATA_PAGE_SIZE + 1
        self.report_download_page(username, start_index)

        request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

        page = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        if len(ids_in_page) < self._GDATA_PAGE_SIZE:

        all_ids_count = len(video_ids)
        # Apply --playlist-start / --playlist-end slicing (1-based options).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    NOTE(review): some structural lines (`if mobj is None:`, `try:`,
    `return`/`break`, the paging loop header, list initializations, and the
    `_PAGE_SIZE` class attribute referenced below) appear to be missing
    from this copy of the source; statements are kept byte-for-byte as
    found.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        username = mobj.group(1)

        # AJAX endpoint template; users_id is filled in from the profile page.
        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        page_base = page_base % mobj.group(1)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        self.report_download_page(username, pagenum)

        request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )

        page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(r'href="/([^"]+)"', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(unescapeHTML(mobj.group(1)))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        if len(ids_in_page) < self._PAGE_SIZE:

        all_ids_count = len(video_ids)
        # Apply --playlist-start / --playlist-end slicing (1-based options).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
1979 class DepositFilesIE(InfoExtractor):
1980 """Information extractor for depositfiles.com"""
1982 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1983 IE_NAME = u'DepositFiles'
1985 def __init__(self, downloader=None):
1986 InfoExtractor.__init__(self, downloader)
1988 def report_download_webpage(self, file_id):
1989 """Report webpage download."""
1990 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1992 def report_extraction(self, file_id):
1993 """Report information extraction."""
1994 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1996 def _real_extract(self, url):
# The file id is the last path component; the URL is rebuilt against the
# English locale so the scrape regexes below match regardless of input locale.
1997 file_id = url.split('/')[-1]
1998 # Rebuild url in english locale
1999 url = 'http://depositfiles.com/en/files/' + file_id
2001 # Retrieve file webpage with 'Free download' button pressed
2002 free_download_indication = { 'gateway_result' : '1' }
# POSTing gateway_result=1 simulates pressing "Free download", which makes
# the page embed the real fileshare form we scrape below.
2003 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
2005 self.report_download_webpage(file_id)
2006 webpage = compat_urllib_request.urlopen(request).read()
2007 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2008 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
2011 # Search for the real file URL
2012 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2013 if (mobj is None) or (mobj.group(1) is None):
2014 # Try to figure out reason of the error.
# When the download form is absent, surface the site's own restriction
# notice (an <strong>Attention...</strong> banner) as the error message.
2015 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2016 if (mobj is not None) and (mobj.group(1) is not None):
2017 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2018 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2020 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2023 file_url = mobj.group(1)
2024 file_extension = os.path.splitext(file_url)[1][1:]
2026 # Search for file title
2027 mobj = re.search(r'<b title="(.*?)">', webpage)
2029 self._downloader.trouble(u'ERROR: unable to extract title')
2031 file_title = mobj.group(1).decode('utf-8')
2034 'id': file_id.decode('utf-8'),
2035 'url': file_url.decode('utf-8'),
2037 'upload_date': None,
2038 'title': file_title,
2039 'ext': file_extension.decode('utf-8'),
2043 class FacebookIE(InfoExtractor):
2044 """Information Extractor for Facebook"""
2047 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2048 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2049 _NETRC_MACHINE = 'facebook'
# Formats are listed best-first; selection logic below relies on this order.
2050 _available_formats = ['video', 'highqual', 'lowqual']
2051 _video_extensions = {
2056 IE_NAME = u'facebook'
2058 def __init__(self, downloader=None):
2059 InfoExtractor.__init__(self, downloader)
2061 def _reporter(self, message):
2062 """Add header and report message."""
2063 self._downloader.to_screen(u'[facebook] %s' % message)
2065 def report_login(self):
2066 """Report attempt to log in."""
2067 self._reporter(u'Logging in')
2069 def report_video_webpage_download(self, video_id):
2070 """Report attempt to download video webpage."""
2071 self._reporter(u'%s: Downloading video webpage' % video_id)
2073 def report_information_extraction(self, video_id):
2074 """Report attempt to extract video information."""
2075 self._reporter(u'%s: Extracting video information' % video_id)
2077 def _parse_page(self, video_webpage):
2078 """Extract video information from page"""
# Each metadata field is scraped with its own regex; missing fields are
# simply omitted from the returned dict (callers must check membership).
2080 data = {'title': r'\("video_title", "(.*?)"\)',
2081 'description': r'<div class="datawrap">(.*?)</div>',
2082 'owner': r'\("video_owner_name", "(.*?)"\)',
2083 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2086 for piece in data.keys():
2087 mobj = re.search(data[piece], video_webpage)
2088 if mobj is not None:
2089 video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
# Collect one media URL per known format name, if present in the page.
2093 for fmt in self._available_formats:
2094 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2095 if mobj is not None:
2096 # URL is in a Javascript segment inside an escaped Unicode format within
2097 # the generally utf-8 page
2098 video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
2099 video_info['video_urls'] = video_urls
2103 def _real_initialize(self):
# Log in before extraction. Credentials come from --username/--password
# or, failing that, from the 'facebook' machine entry in ~/.netrc.
2104 if self._downloader is None:
2109 downloader_params = self._downloader.params
2111 # Attempt to use provided username and password or .netrc data
2112 if downloader_params.get('username', None) is not None:
2113 useremail = downloader_params['username']
2114 password = downloader_params['password']
2115 elif downloader_params.get('usenetrc', False):
2117 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2118 if info is not None:
2122 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2123 except (IOError, netrc.NetrcParseError) as err:
2124 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
2127 if useremail is None:
2136 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2139 login_results = compat_urllib_request.urlopen(request).read()
# A login <form> still present in the response means authentication failed;
# this is reported as a warning, not a fatal error.
2140 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2141 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2143 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2144 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
2147 def _real_extract(self, url):
2148 mobj = re.match(self._VALID_URL, url)
2150 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2152 video_id = mobj.group('ID')
2155 self.report_video_webpage_download(video_id)
2156 request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2158 page = compat_urllib_request.urlopen(request)
2159 video_webpage = page.read()
2160 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2161 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2164 # Start extracting information
2165 self.report_information_extraction(video_id)
2167 # Extract information
2168 video_info = self._parse_page(video_webpage)
# uploader and title are mandatory; thumbnail is optional (empty fallback).
2171 if 'owner' not in video_info:
2172 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2174 video_uploader = video_info['owner']
2177 if 'title' not in video_info:
2178 self._downloader.trouble(u'ERROR: unable to extract video title')
2180 video_title = video_info['title']
2181 video_title = video_title.decode('utf-8')
2184 if 'thumbnail' not in video_info:
2185 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2186 video_thumbnail = ''
2188 video_thumbnail = video_info['thumbnail']
# Upload date: parse the page's RFC-2822-style timestamp into YYYYMMDD.
2192 if 'upload_date' in video_info:
2193 upload_time = video_info['upload_date']
2194 timetuple = email.utils.parsedate_tz(upload_time)
2195 if timetuple is not None:
2197 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2202 video_description = video_info.get('description', 'No description available.')
2204 url_map = video_info['video_urls']
2205 if len(url_map.keys()) > 0:
2206 # Decide which formats to download
2207 req_format = self._downloader.params.get('format', None)
2208 format_limit = self._downloader.params.get('format_limit', None)
# --format-limit caps quality by slicing the best-first format list.
2210 if format_limit is not None and format_limit in self._available_formats:
2211 format_list = self._available_formats[self._available_formats.index(format_limit):]
2213 format_list = self._available_formats
2214 existing_formats = [x for x in format_list if x in url_map]
2215 if len(existing_formats) == 0:
2216 self._downloader.trouble(u'ERROR: no known formats available for video')
# format selection: default = best, 'worst' = last, '-1' = all, else exact.
2218 if req_format is None:
2219 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2220 elif req_format == 'worst':
2221 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2222 elif req_format == '-1':
2223 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2226 if req_format not in url_map:
2227 self._downloader.trouble(u'ERROR: requested format not available')
2229 video_url_list = [(req_format, url_map[req_format])] # Specific format
2232 for format_param, video_real_url in video_url_list:
2234 video_extension = self._video_extensions.get(format_param, 'mp4')
2237 'id': video_id.decode('utf-8'),
2238 'url': video_real_url.decode('utf-8'),
2239 'uploader': video_uploader.decode('utf-8'),
2240 'upload_date': upload_date,
2241 'title': video_title,
2242 'ext': video_extension.decode('utf-8'),
2243 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2244 'thumbnail': video_thumbnail.decode('utf-8'),
2245 'description': video_description.decode('utf-8'),
2249 class BlipTVIE(InfoExtractor):
2250 """Information extractor for blip.tv"""
2252 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# Used to pull the filename extension out of the media URL.
2253 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2254 IE_NAME = u'blip.tv'
2256 def report_extraction(self, file_id):
2257 """Report information extraction."""
2258 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2260 def report_direct_download(self, title):
2261 """Report information extraction."""
2262 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2264 def _real_extract(self, url):
2265 mobj = re.match(self._VALID_URL, url)
2267 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Ask the page for its JSON representation via skin=json query parameters.
2274 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2275 request = compat_urllib_request.Request(json_url)
2276 self.report_extraction(mobj.group(1))
2279 urlh = compat_urllib_request.urlopen(request)
# If the server answers with video/* content, the URL already points at the
# media file itself; build minimal info from the URL instead of JSON.
2280 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2281 basename = url.split('/')[-1]
2282 title,ext = os.path.splitext(basename)
2283 title = title.decode('UTF-8')
2284 ext = ext.replace('.', '')
2285 self.report_direct_download(title)
2290 'upload_date': None,
2295 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2296 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2298 if info is None: # Regular URL
2300 json_code_bytes = urlh.read()
2301 json_code = json_code_bytes.decode('utf-8')
2302 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2303 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
2307 json_data = json.loads(json_code)
2308 if 'Post' in json_data:
2309 data = json_data['Post']
# blip.tv timestamps look like "MM-DD-YY HH:MMam/pm"; normalize to YYYYMMDD.
2313 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2314 video_url = data['media']['url']
2315 umobj = re.match(self._URL_EXT, video_url)
2317 raise ValueError('Can not determine filename extension')
2318 ext = umobj.group(1)
2321 'id': data['item_id'],
2323 'uploader': data['display_name'],
2324 'upload_date': upload_date,
2325 'title': data['title'],
2327 'format': data['media']['mimeType'],
2328 'thumbnail': data['thumbnailUrl'],
2329 'description': data['description'],
2330 'player_url': data['embedUrl']
2332 except (ValueError,KeyError) as err:
2333 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
# HACK: blip.tv serves some media only to the iTunes user agent; override the
# global headers so the subsequent download request is accepted.
2336 std_headers['User-Agent'] = 'iTunes/10.6.1'
2340 class MyVideoIE(InfoExtractor):
2341 """Information Extractor for myvideo.de."""
2343 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2344 IE_NAME = u'myvideo'
2346 def __init__(self, downloader=None):
2347 InfoExtractor.__init__(self, downloader)
2349 def report_download_webpage(self, video_id):
2350 """Report webpage download."""
2351 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2353 def report_extraction(self, video_id):
2354 """Report information extraction."""
2355 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2357 def _real_extract(self,url):
2358 mobj = re.match(self._VALID_URL, url)
2360 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2363 video_id = mobj.group(1)
2366 request = compat_urllib_request.Request('http://www.myvideo.de/watch/%s' % video_id)
2368 self.report_download_webpage(video_id)
2369 webpage = compat_urllib_request.urlopen(request).read()
2370 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2371 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
2374 self.report_extraction(video_id)
2375 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2378 self._downloader.trouble(u'ERROR: unable to extract media URL')
2380 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2382 mobj = re.search('<title>([^<]+)</title>', webpage)
2384 self._downloader.trouble(u'ERROR: unable to extract title')
2387 video_title = mobj.group(1)
2393 'upload_date': None,
2394 'title': video_title,
2398 class ComedyCentralIE(InfoExtractor):
2399 """Information extractor for The Daily Show and Colbert Report """
2401 # urls can be abbreviations like :thedailyshow or :colbert
2402 # urls for episodes like:
2403 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2404 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2405 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2406 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2407 |(https?://)?(www\.)?
2408 (?P<showname>thedailyshow|colbertnation)\.com/
2409 (full-episodes/(?P<episode>.*)|
2411 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2412 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
2414 IE_NAME = u'comedycentral'
# Bitrates, best last; _real_extract picks turls[-1] (the highest) by default.
2416 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2418 _video_extensions = {
2426 _video_dimensions = {
2435 def suitable(self, url):
2436 """Receives a URL and returns True if suitable for this IE."""
# Overridden because _VALID_URL is written with re.VERBOSE whitespace.
2437 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
2439 def report_extraction(self, episode_id):
2440 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2442 def report_config_download(self, episode_id):
2443 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2445 def report_index_download(self, episode_id):
2446 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2448 def report_player_url(self, episode_id):
2449 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2452 def _print_formats(self, formats):
2453 print('Available formats:')
2455 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2458 def _real_extract(self, url):
2459 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2461 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Shorthand forms (:tds / :colbert) are expanded to the newest full episode.
2464 if mobj.group('shortname'):
2465 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2466 url = u'http://www.thedailyshow.com/full-episodes/'
2468 url = u'http://www.colbertnation.com/full-episodes/'
2469 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2470 assert mobj is not None
2472 if mobj.group('clip'):
2473 if mobj.group('showname') == 'thedailyshow':
2474 epTitle = mobj.group('tdstitle')
2476 epTitle = mobj.group('cntitle')
2479 dlNewest = not mobj.group('episode')
2481 epTitle = mobj.group('showname')
2483 epTitle = mobj.group('episode')
2485 req = compat_urllib_request.Request(url)
2486 self.report_extraction(epTitle)
2488 htmlHandle = compat_urllib_request.urlopen(req)
2489 html = htmlHandle.read()
2490 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2491 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
# The site redirects the index page to the newest episode; re-match the
# final URL to recover a concrete episode slug.
2494 url = htmlHandle.geturl()
2495 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2497 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2499 if mobj.group('episode') == '':
2500 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2502 epTitle = mobj.group('episode')
2504 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', html)
2506 if len(mMovieParams) == 0:
2507 # The Colbert Report embeds the information in a without
2508 # a URL prefix; so extract the alternate reference
2509 # and then add the URL prefix manually.
2511 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', html)
2512 if len(altMovieParams) == 0:
2513 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2516 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2518 playerUrl_raw = mMovieParams[0][0]
2519 self.report_player_url(epTitle)
2521 urlHandle = compat_urllib_request.urlopen(playerUrl_raw)
2522 playerUrl = urlHandle.geturl()
2523 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2524 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + compat_str(err))
# The mgid-style URI keys the MRSS feed listing every media item (act) of
# the episode; each item is then resolved via its own mediagen config XML.
2527 uri = mMovieParams[0][1]
2528 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2529 self.report_index_download(epTitle)
2531 indexXml = compat_urllib_request.urlopen(indexUrl).read()
2532 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2533 self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
2538 idoc = xml.etree.ElementTree.fromstring(indexXml)
2539 itemEls = idoc.findall('.//item')
2540 for itemEl in itemEls:
2541 mediaId = itemEl.findall('./guid')[0].text
2542 shortMediaId = mediaId.split(':')[-1]
2543 showId = mediaId.split(':')[-2].replace('.com', '')
2544 officialTitle = itemEl.findall('./title')[0].text
2545 officialDate = itemEl.findall('./pubDate')[0].text
2547 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2548 compat_urllib_parse.urlencode({'uri': mediaId}))
2549 configReq = compat_urllib_request.Request(configUrl)
2550 self.report_config_download(epTitle)
2552 configXml = compat_urllib_request.urlopen(configReq).read()
2553 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2554 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
# Collect (bitrate, rtmp-url) pairs from the mediagen renditions.
2557 cdoc = xml.etree.ElementTree.fromstring(configXml)
2559 for rendition in cdoc.findall('.//rendition'):
2560 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2564 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2567 if self._downloader.params.get('listformats', None):
2568 self._print_formats([i[0] for i in turls])
2571 # For now, just pick the highest bitrate
2572 format,video_url = turls[-1]
2574 # Get the format arg from the arg stream
2575 req_format = self._downloader.params.get('format', None)
2577 # Select format if we can find one
2580 format, video_url = f, v
2583 # Patch to download from alternative CDN, which does not
2584 # break on current RTMPDump builds
2585 broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
2586 better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"
2588 if video_url.startswith(broken_cdn):
2589 video_url = video_url.replace(broken_cdn, better_cdn)
2591 effTitle = showId + u'-' + epTitle
2596 'upload_date': officialDate,
2601 'description': officialTitle,
2602 'player_url': None #playerUrl
2605 results.append(info)
2610 class EscapistIE(InfoExtractor):
2611 """Information extractor for The Escapist """
2613 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2614 IE_NAME = u'escapist'
2616 def report_extraction(self, showName):
2617 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2619 def report_config_download(self, showName):
2620 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2622 def _real_extract(self, url):
2623 mobj = re.match(self._VALID_URL, url)
2625 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2627 showName = mobj.group('showname')
2628 videoId = mobj.group('episode')
2630 self.report_extraction(showName)
2632 webPage = compat_urllib_request.urlopen(url)
2633 webPageBytes = webPage.read()
# Decode using the charset advertised in the Content-Type header,
# falling back to utf-8 when none is declared.
2634 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2635 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2636 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2637 self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
# Metadata comes from <meta> tags; the player config URL is carried in the
# og:video URL's "config=" query parameter.
2640 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2641 description = unescapeHTML(descMatch.group(1))
2642 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2643 imgUrl = unescapeHTML(imgMatch.group(1))
2644 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2645 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2646 configUrlMatch = re.search('config=(.*)$', playerUrl)
2647 configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2649 self.report_config_download(showName)
2651 configJSON = compat_urllib_request.urlopen(configUrl).read()
2652 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2653 self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
2656 # Technically, it's JavaScript, not JSON
# NOTE(review): single→double quote substitution makes the JS object parseable
# as JSON but would corrupt any value containing an apostrophe.
2657 configJSON = configJSON.replace("'", '"')
2660 config = json.loads(configJSON)
2661 except (ValueError,) as err:
2662 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
2665 playlist = config['playlist']
# playlist[1] is assumed to be the actual episode entry — TODO confirm.
2666 videoUrl = playlist[1]['url']
2671 'uploader': showName,
2672 'upload_date': None,
2675 'thumbnail': imgUrl,
2676 'description': description,
2677 'player_url': playerUrl,
2683 class CollegeHumorIE(InfoExtractor):
2684 """Information extractor for collegehumor.com"""
2687 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2688 IE_NAME = u'collegehumor'
2690 def report_manifest(self, video_id):
2691 """Report information extraction."""
2692 self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))
2694 def report_extraction(self, video_id):
2695 """Report information extraction."""
2696 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2698 def _real_extract(self, url):
2699 mobj = re.match(self._VALID_URL, url)
2701 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2703 video_id = mobj.group('videoid')
2708 'upload_date': None,
# Two XML round-trips: the moogaloop metadata XML provides title/description/
# thumbnail and an f4m manifest URL; the manifest then yields the media URL.
2711 self.report_extraction(video_id)
2712 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2714 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2715 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2716 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2719 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2721 videoNode = mdoc.findall('./video')[0]
2722 info['description'] = videoNode.findall('./description')[0].text
2723 info['title'] = videoNode.findall('./caption')[0].text
2724 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2725 manifest_url = videoNode.findall('./file')[0].text
2727 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
# hdcore parameter is required for the Adobe HDS (f4m) manifest endpoint.
2730 manifest_url += '?hdcore=2.10.3'
2731 self.report_manifest(video_id)
2733 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2734 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2735 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2738 adoc = xml.etree.ElementTree.fromstring(manifestXml)
# Elements live in the Adobe f4m namespace, hence the fully-qualified tags.
2740 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2741 node_id = media_node.attrib['url']
2742 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2743 except IndexError as err:
2744 self._downloader.trouble(u'\nERROR: Invalid manifest file')
# Assemble the first-segment/first-fragment URL from the manifest host.
2747 url_pr = compat_urllib_parse_urlparse(manifest_url)
2748 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
2755 class XVideosIE(InfoExtractor):
2756 """Information extractor for xvideos.com"""
2758 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2759 IE_NAME = u'xvideos'
2761 def report_webpage(self, video_id):
2762 """Report information extraction."""
2763 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2765 def report_extraction(self, video_id):
2766 """Report information extraction."""
2767 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2769 def _real_extract(self, url):
2770 mobj = re.match(self._VALID_URL, url)
2772 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2774 video_id = mobj.group(1)
2776 self.report_webpage(video_id)
# Canonical watch URL is rebuilt from the numeric id.
2778 request = compat_urllib_request.Request(r'http://www.xvideos.com/video' + video_id)
2780 webpage_bytes = compat_urllib_request.urlopen(request).read()
2781 webpage = webpage_bytes.decode('utf-8', 'replace')
2782 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2783 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2786 self.report_extraction(video_id)
# The FLV URL is URL-encoded inside the page's flv_url= parameter.
2790 mobj = re.search(r'flv_url=(.+?)&', webpage)
2792 self._downloader.trouble(u'ERROR: unable to extract video url')
2794 video_url = compat_urllib_parse.unquote(mobj.group(1))
2798 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2800 self._downloader.trouble(u'ERROR: unable to extract video title')
2802 video_title = mobj.group(1)
2805 # Extract video thumbnail
2806 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2808 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
# group(0): the whole matched URL is the thumbnail, not the captured basename.
2810 video_thumbnail = mobj.group(0)
2816 'upload_date': None,
2817 'title': video_title,
2819 'thumbnail': video_thumbnail,
2820 'description': None,
2826 class SoundcloudIE(InfoExtractor):
2827 """Information extractor for soundcloud.com
2828 To access the media, the uid of the song and a stream token
2829 must be extracted from the page source and the script must make
2830 a request to media.soundcloud.com/crossdomain.xml. Then
2831 the media can be grabbed by requesting from an url composed
2832 of the stream token and uid
2835 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2836 IE_NAME = u'soundcloud'
2838 def __init__(self, downloader=None):
2839 InfoExtractor.__init__(self, downloader)
2841 def report_resolve(self, video_id):
2842 """Report information extraction."""
2843 self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))
2845 def report_extraction(self, video_id):
2846 """Report information extraction."""
2847 self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))
2849 def _real_extract(self, url):
2850 mobj = re.match(self._VALID_URL, url)
2852 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2855 # extract uploader (which is in the url)
2856 uploader = mobj.group(1)
2857 # extract simple title (uploader + slug of song title)
2858 slug_title = mobj.group(2)
2859 simple_title = uploader + u'-' + slug_title
2861 self.report_resolve('%s/%s' % (uploader, slug_title))
# Step 1: resolve the human-readable track URL to its numeric track id via
# the public resolve API (client_id is a hard-coded app key).
2863 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2864 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2865 request = compat_urllib_request.Request(resolv_url)
2867 info_json_bytes = compat_urllib_request.urlopen(request).read()
2868 info_json = info_json_bytes.decode('utf-8')
2869 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2870 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2873 info = json.loads(info_json)
2874 video_id = info['id']
2875 self.report_extraction('%s/%s' % (uploader, slug_title))
# Step 2: ask the streams endpoint for the 128kbps MP3 HTTP stream URL.
2877 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2878 request = compat_urllib_request.Request(streams_url)
2880 stream_json_bytes = compat_urllib_request.urlopen(request).read()
2881 stream_json = stream_json_bytes.decode('utf-8')
2882 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2883 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2886 streams = json.loads(stream_json)
2887 mediaURL = streams['http_mp3_128_url']
2892 'uploader': info['user']['username'],
2893 'upload_date': info['created_at'],
2894 'title': info['title'],
2896 'description': info['description'],
2900 class InfoQIE(InfoExtractor):
2901 """Information extractor for infoq.com"""
2903 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2906 def report_webpage(self, video_id):
2907 """Report information extraction."""
2908 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2910 def report_extraction(self, video_id):
2911 """Report information extraction."""
2912 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2914 def _real_extract(self, url):
2915 mobj = re.match(self._VALID_URL, url)
2917 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2920 self.report_webpage(url)
2922 request = compat_urllib_request.Request(url)
2924 webpage = compat_urllib_request.urlopen(request).read()
2925 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2926 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2929 self.report_extraction(url)
# The RTMP path is base64-encoded in the page's jsclassref attribute;
# NOTE(review): .decode('base64') is a Python-2-only bytes codec.
2933 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2935 self._downloader.trouble(u'ERROR: unable to extract video url')
2937 video_url = 'rtmpe://video.infoq.com/cfx/st/' + compat_urllib_parse.unquote(mobj.group(1).decode('base64'))
2941 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2943 self._downloader.trouble(u'ERROR: unable to extract video title')
2945 video_title = mobj.group(1).decode('utf-8')
2947 # Extract description
2948 video_description = u'No description available.'
2949 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2950 if mobj is not None:
2951 video_description = mobj.group(1).decode('utf-8')
# The id/extension come from the media file name at the end of the RTMP path.
2953 video_filename = video_url.split('/')[-1]
2954 video_id, extension = video_filename.split('.')
2960 'upload_date': None,
2961 'title': video_title,
2962 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2964 'description': video_description,
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""
    # NOTE(review): reconstructed from a garbled paste (dropped try/return
    # lines, lost indentation) -- verify control flow against upstream.
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        try:
            bitrate_list = jsonData[fmt]
            # 'best' (or an unknown bitrate) falls back to the highest one
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                # A successful open means the url is reachable.
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                url = None

        return None

    def _print_formats(self, formats):
        """Print a table of format/bitrate/extension for --list-formats."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Resolve the cloudcast via the Mixcloud JSON API."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Try every advertised format; keep the first reachable url.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats.keys():
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
        }]
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""
    # NOTE(review): reconstructed from a garbled paste (dropped guards and
    # dict openers) -- verify the playlist branches against upstream.

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Dispatch on the URL: single video, course page, or site root."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            try:
                coursepage = compat_urllib_request.urlopen(url).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Every VideoPage link becomes a reference entry, recursively
            # resolved through self.extract() below.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                for vpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results

        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # Recurse into every course page found on the root page.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""
    # NOTE(review): reconstructed from a garbled paste (dropped guards,
    # try lines, and the final info dict) -- verify against upstream.

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'  # presumably; the IE_NAME line was dropped -- confirm

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Scrape the video page, then fetch the mediaGen XML for the stream."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')
        self.report_webpage(video_id)

        request = compat_urllib_request.Request(url)
        try:
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        # mediaGen XML describes the available renditions of the video.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com."""
    # NOTE(review): reconstructed from a garbled paste; the format-selection
    # branch and a few helper lines were dropped -- verify against upstream.

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
    IE_NAME = u'Youku'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)

    def _gen_sid(self):
        """Build a pseudo-random session id: timestamp(ms) + two random ints."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Shuffle the alphabet deterministically from `seed` (Youku's LCG)."""
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)  # presumably cast before the LCG -- confirm
        for i in range(len(source)):
            # Linear congruential step; each round picks and removes one char.
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode the obfuscated file id via the seed-derived mix string."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        """Fetch the playlist JSON and emit one info dict per segment."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = config['data'][0]['streamfileids'].keys()

            # NOTE(review): branch bodies reconstructed -- confirm upstream.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        files_info = []
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""
    # NOTE(review): reconstructed from a garbled paste (dropped guards and
    # parts of the final info dict) -- verify against upstream.

    _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'  # presumably; the IE_NAME line was dropped -- confirm
    VIDEO_URL_RE = r'flv_url=(.*?)&'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Scrape flv url, title and thumbnail from the video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        # Get webpage content
        try:
            webpage_bytes = compat_urllib_request.urlopen(url).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return

        result = re.search(self.VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group(1))

        result = re.search(self.VIDEO_TITLE_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = result.group(1)

        result = re.search(self.VIDEO_THUMB_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = result.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',  # presumably flv (flv_url source) -- confirm
            'thumbnail': video_thumbnail,
            'description': None,
        }]
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""
    # NOTE(review): reconstructed from a garbled paste. Some fatal-error
    # guards may have lacked an explicit `return` upstream; `return` is added
    # here after each fatal trouble() so a failed match cannot fall through
    # to mobj.group() on None. unicode() below is Python 2 only.

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading extry"""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url.decode('utf-8'))

    def report_date(self, upload_date):
        """Report downloading extry"""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report downloading extry"""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader.decode('utf-8'))

    def report_title(self, video_title):
        """Report downloading extry"""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title.decode('utf-8'))

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page.decode('utf-8'))

    def _real_extract(self, url):
        """Fetch the post page, find the photo/video page, pick best stream."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(2)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video page URL')
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)

        # Extract video links on video page
        """Extract video links of all sizes"""
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.trouble(u'ERROR: unable to extract video links')
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        video_url = unicode(video_url, "unicode_escape")

        return [{
            'id': video_id.decode('utf-8'),
            'url': video_url,
            'uploader': uploader.decode('utf-8'),
            'upload_date': upload_date.decode('utf-8'),
            'title': video_title.decode('utf-8'),
            'ext': video_extension.decode('utf-8'),
        }]
3591 class NBAIE(InfoExtractor):
3592 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3595 def report_extraction(self, video_id):
3596 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3598 def _real_extract(self, url):
3599 mobj = re.match(self._VALID_URL, url)
3601 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3604 video_id = mobj.group(1)
3605 if video_id.endswith('/index.html'):
3606 video_id = video_id[:-len('/index.html')]
3608 self.report_extraction(video_id)
3610 urlh = compat_urllib_request.urlopen(url)
3611 webpage_bytes = urlh.read()
3612 webpage = webpage_bytes.decode('utf-8', 'ignore')
3613 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3614 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
3617 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
3618 def _findProp(rexp, default=None):
3619 m = re.search(rexp, webpage)
3621 return unescapeHTML(m.group(1))
3625 shortened_video_id = video_id.rpartition('/')[2]
3626 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3628 'id': shortened_video_id,
3632 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3633 'description': _findProp(r'<div class="description">(.*?)</h1>'),