2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
13 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    NOTE(review): this listing is a sampled excerpt of the original file;
    several docstring lines are elided below (gaps in the original line
    numbering) — confirm the full contract against the complete source.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    uploader:       Nickname of the video uploader, unescaped.
    upload_date:    Video upload date (YYYYMMDD).
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    [elided in listing: the remainder of this sentence is cut here]

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """
def __init__(self, downloader=None):
    """Constructor. Receives an optional downloader."""
    # NOTE(review): one line (original line 69) is elided in this sampled
    # listing — presumably internal-state initialization (e.g. a readiness
    # flag consumed by initialize()); confirm against the full source.
    self.set_downloader(downloader)
72 def suitable(self, url):
73 """Receives a URL and returns True if suitable for this IE."""
74 return re.match(self._VALID_URL, url) is not None
77 """Getter method for _WORKING."""
81 """Initializes an instance (authentication, etc)."""
83 self._real_initialize()
def extract(self, url):
    """Extracts URL information and returns it in list of dicts."""
    # NOTE(review): one line (original line 88) is elided here — likely a
    # call into the lazy-initialization path (e.g. self.initialize()) before
    # extraction; confirm against the full source.
    return self._real_extract(url)
91 def set_downloader(self, downloader):
92 """Sets the downloader for this IE."""
93 self._downloader = downloader
95 def _real_initialize(self):
96 """Real initialization process. Redefine in subclasses."""
99 def _real_extract(self, url):
100 """Real extraction process. Redefine in subclasses."""
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # NOTE(review): sampled excerpt — the r'''...''' delimiters of the
    # _VALID_URL verbose regex and several of its lines are elided in this
    # listing; the surviving fragments are preserved verbatim below.
    (?:https?://)?                                              # http(s):// (optional)
    (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
       tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
    (?:.*?\#/)?                                                 # handle anchor (#/) redirect urls
    (?!view_play_list|my_playlists|artist|playlist) # ignore playlist URLs
    (?:                                                         # the various things that can precede the ID:
        (?:(?:v|embed|e)/)                                      # v/ or embed/ or e/
        |(?:                                                    # or the v= param in all its forms
            (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
            (?:\?|\#!?)                                         # the params delimiter ? or # or #!
            (?:.+&)?                                            # any other preceding param (like /?s=tuff&v=xxxx)
            # [elided in listing: the v= capture group itself]
        )?                                                      # optional -> youtube.com/xxxx is OK
    )?                                                          # all until now is optional -> you can pass the naked ID
    ([0-9A-Za-z_-]+)                                            # here is it! the YouTube video ID
    (?(1).+)?                                                   # if we found the ID, everything can follow

    # URL that forces the site UI to English (stable scraping targets).
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Captures the next_url= query parameter used by redirect/age-gate URLs.
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # NOTE(review): the bodies of the two dict literals below are mostly
    # elided in this sampled listing (itag -> extension / dimensions tables).
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    _video_dimensions = {
165 def suitable(self, url):
166 """Receives a URL and returns True if suitable for this IE."""
167 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
def report_lang(self):
    """Report attempt to set language."""
    self._downloader.to_screen(u'[youtube] Setting language')

def report_login(self):
    """Report attempt to log in."""
    self._downloader.to_screen(u'[youtube] Logging in')

def report_age_confirmation(self):
    """Report attempt to confirm age."""
    self._downloader.to_screen(u'[youtube] Confirming age')

def report_video_webpage_download(self, video_id):
    """Report attempt to download video webpage."""
    self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

def report_video_info_webpage_download(self, video_id):
    """Report attempt to download video info webpage."""
    self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

def report_video_subtitles_download(self, video_id):
    """Report attempt to download video subtitles."""
    # (docstring fixed: was a copy-paste of the info-webpage docstring)
    self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

def report_information_extraction(self, video_id):
    """Report attempt to extract video information."""
    self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

def report_unavailable_format(self, video_id, format):
    """Report that the requested format is not available."""
    # (docstring fixed: previously read "Report extracted video URL.")
    self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

def report_rtmp_download(self):
    """Indicate the download will use the RTMP protocol."""
    self._downloader.to_screen(u'[youtube] RTMP download detected')
def _closed_captions_xml_to_srt(self, xml_string):
    """Convert YouTube's closed-caption XML into SubRip (.srt) text.

    Each <text start="..." dur="...">caption</text> element becomes a
    numbered SRT cue with `HH:MM:SS,mmm --> HH:MM:SS,mmm` timestamps.

    As shown in this listing the accumulator `srt`, the float conversion of
    `start`, and the final `return` were missing (elided lines), so `srt`
    and `start + float(dur)` were used before being valid — restored here.
    """
    srt = ''
    texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
    # TODO parse xml instead of regex
    for n, (start, dur_tag, dur, caption) in enumerate(texts):
        if not dur:
            dur = '4'  # captions without an explicit duration stay up 4 seconds
        start = float(start)
        end = start + float(dur)
        # SRT timestamps: hours:minutes:seconds,milliseconds
        start = "%02i:%02i:%02i,%03i" % (start/(60*60), start/60%60, start%60, start%1*1000)
        end = "%02i:%02i:%02i,%03i" % (end/(60*60), end/60%60, end%60, end%1*1000)
        caption = unescapeHTML(caption)
        caption = unescapeHTML(caption)  # double cycle, intentional
        srt += str(n+1) + '\n'
        srt += start + ' --> ' + end + '\n'
        srt += caption + '\n\n'
    return srt
222 def _print_formats(self, formats):
223 print('Available formats:')
225 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
def _real_initialize(self):
    # NOTE(review): sampled excerpt — control-flow lines (try:, else:,
    # return, dict delimiters) are elided at the points marked below;
    # confirm against the full source before editing.
    if self._downloader is None:
        # [elided in listing: early return]
    downloader_params = self._downloader.params

    # Attempt to use provided username and password or .netrc data
    if downloader_params.get('username', None) is not None:
        username = downloader_params['username']
        password = downloader_params['password']
    elif downloader_params.get('usenetrc', False):
        # [elided in listing: try: around the netrc lookup and the
        #  success branch reading the credentials out of `info`]
        info = netrc.netrc().authenticators(self._NETRC_MACHINE)
        raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
        except (IOError, netrc.NetrcParseError) as err:
            # best-effort: warn and continue without credentials
            self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))

    # Force the site UI to English so later scraping sees stable text.
    request = compat_urllib_request.Request(self._LANG_URL)
    # [elided in listing: try: + self.report_lang()]
    compat_urllib_request.urlopen(request).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))

    # No authentication to be performed
    # [elided in listing: early return when no username is available]

    # Log in — the lines below are surviving entries of the login form
    # dict literal; its delimiters and remaining keys are elided.
    'current_form': 'loginForm',
    'action_login': 'Log In',
    'username': username,
    'password': password,
    request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
    # [elided in listing: try: + self.report_login()]
    login_results = compat_urllib_request.urlopen(request).read()
    # A login form still present in the response means the login failed.
    if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
        self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))

    # Confirm age — surviving entry of the age_form dict literal.
    'action_confirm': 'Confirm',
    request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
    # [elided in listing: try:]
    self.report_age_confirmation()
    age_results = compat_urllib_request.urlopen(request).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
def _real_extract(self, url):
    # NOTE(review): sampled excerpt — guard lines (if mobj is None:, try:,
    # else:, return, break, literal delimiters) are elided at the points
    # marked below; confirm against the full source before editing.

    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    mobj = re.search(self._NEXT_URL_RE, url)
    # [elided in listing: if mobj is not None:]
    url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')

    # Extract video id from URL
    mobj = re.match(self._VALID_URL, url, re.VERBOSE)
    # [elided in listing: if mobj is None: ... return]
    self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
    video_id = mobj.group(2)

    # Download the watch page (has_verified=1 skips the age interstitial).
    self.report_video_webpage_download(video_id)
    request = compat_urllib_request.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
    # [elided in listing: try:]
    video_webpage_bytes = compat_urllib_request.urlopen(request).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
    video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    # [elided in listing: if mobj is not None: ... else branch]
    player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

    # Get video info, trying several el= contexts until one yields a token.
    self.report_video_info_webpage_download(video_id)
    for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
        video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                % (video_id, el_type))
        request = compat_urllib_request.Request(video_info_url)
        # [elided in listing: try:]
        video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
        video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
        video_info = compat_parse_qs(video_info_webpage)
        if 'token' in video_info:
            # [elided in listing: break]
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
    if 'token' not in video_info:
        if 'reason' in video_info:
            self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
        # [elided in listing: else:]
        self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        self._downloader.trouble(u'ERROR: "rental" videos not supported')

    # Start extracting information
    self.report_information_extraction(video_id)

    # uploader
    if 'author' not in video_info:
        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
    video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

    # title
    if 'title' not in video_info:
        self._downloader.trouble(u'ERROR: unable to extract video title')
    video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

    # thumbnail image
    if 'thumbnail_url' not in video_info:
        self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
        # [elided in listing: fallback assignment to video_thumbnail]
    else:   # don't panic if we can't find it
        video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

    # upload date — scraped from the page and normalized to YYYYMMDD.
    mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
    # [elided in listing: if mobj is not None:]
    upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
    format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
    for expression in format_expressions:
        # [elided in listing: try: ... except around strptime, presumably
        #  swallowing ValueError for non-matching expressions — confirm]
        upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

    # description
    video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:
        video_description = clean_html(video_description)
    # [elided in listing: else:]
    video_description = ''

    # closed captions
    video_subtitles = None
    if self._downloader.params.get('writesubtitles', False):
        # [elided in listing: try: wrapping this whole block, matched by
        #  the `except Trouble` at its end]
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        # [elided in listing: try:]
        srt_list = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            raise Trouble(u'WARNING: video has no closed captions')
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            # [elided in listing: srt_lang = 'en' / else:]
        # NOTE(review): dict.keys()[0] is py2-only — keys() is not
        # subscriptable on py3; flagging, not changing, in this pass.
        srt_lang = srt_lang_list.keys()[0]
        if not srt_lang in srt_lang_list:
            raise Trouble(u'WARNING: no closed captions found in the specified language')
        request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
        # [elided in listing: try:]
        srt_xml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
        # [elided in listing: if not srt_xml:]
        raise Trouble(u'WARNING: unable to download video subtitles')
        video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
        except Trouble as trouble:
            # NOTE(review): trouble[0] indexes the exception — py2-only
            # behavior (removed in py3); flagging, not changing.
            self._downloader.trouble(trouble[0])

    if 'length_seconds' not in video_info:
        self._downloader.trouble(u'WARNING: unable to extract video duration')
        # [elided in listing: fallback assignment / else:]
    video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

    # token
    video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

    # Decide which formats to download
    req_format = self._downloader.params.get('format', None)

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        video_url_list = [(None, video_info['conn'][0])]
    elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
        url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
        url_data = [compat_parse_qs(uds) for uds in url_data_strs]
        url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
        url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

        format_limit = self._downloader.params.get('format_limit', None)
        available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
        if format_limit is not None and format_limit in available_formats:
            format_list = available_formats[available_formats.index(format_limit):]
        # [elided in listing: else:]
        format_list = available_formats
        existing_formats = [x for x in format_list if x in url_map]
        if len(existing_formats) == 0:
            self._downloader.trouble(u'ERROR: no known formats available for video')
        if self._downloader.params.get('listformats', None):
            self._print_formats(existing_formats)
        if req_format is None or req_format == 'best':
            video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
        elif req_format == 'worst':
            video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
        elif req_format in ('-1', 'all'):
            video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
        # [elided in listing: else:]
        # Specific formats. We pick the first in a slash-delimited sequence.
        # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
        req_formats = req_format.split('/')
        video_url_list = None
        for rf in req_formats:
            # [elided in listing: if rf in url_map: ... break]
            video_url_list = [(rf, url_map[rf])]
        if video_url_list is None:
            self._downloader.trouble(u'ERROR: requested format not available')
    # [elided in listing: else:]
    self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

    # Build one result dict per selected format.
    for format_param, video_real_url in video_url_list:
        # Extension
        video_extension = self._video_extensions.get(format_param, 'flv')

        video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                self._video_dimensions.get(format_param, '???'))

        # [elided in listing: opening of the result dict literal ('id' key)]
        'url': video_real_url,
        'uploader': video_uploader,
        'upload_date': upload_date,
        'title': video_title,
        'ext': video_extension,
        'format': video_format,
        'thumbnail': video_thumbnail,
        'description': video_description,
        'player_url': player_url,
        'subtitles': video_subtitles,
        'duration': video_duration
        # [elided in listing: closing of the dict and return of the results list]
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # Watch-page URL pattern: group(1) is the video id, group(2) the slug.
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    # Family-filter disclaimer page fetched during initialization.
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    # Endpoint the age/filter confirmation form is POSTed to.
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'
def __init__(self, downloader=None):
    InfoExtractor.__init__(self, downloader)

def report_disclaimer(self):
    """Announce that the family-filter disclaimer page is being fetched."""
    self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

def report_age_confirmation(self):
    """Announce the age-confirmation step."""
    self._downloader.to_screen(u'[metacafe] Confirming age')

def report_download_webpage(self, video_id):
    """Announce that the video webpage is being downloaded."""
    msg = u'[metacafe] %s: Downloading webpage' % video_id
    self._downloader.to_screen(msg)

def report_extraction(self, video_id):
    """Announce the start of information extraction."""
    msg = u'[metacafe] %s: Extracting information' % video_id
    self._downloader.to_screen(msg)
def _real_initialize(self):
    # NOTE(review): sampled excerpt — try:/return lines and the disclaimer
    # form dict delimiters are elided at the points marked below.
    # Retrieve disclaimer
    request = compat_urllib_request.Request(self._DISCLAIMER)
    # [elided in listing: try:]
    self.report_disclaimer()
    disclaimer = compat_urllib_request.urlopen(request).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))

    # Confirm age — surviving entry of the disclaimer_form dict literal;
    # its delimiters and remaining keys are elided in this listing.
    'submit': "Continue - I'm over 18",
    request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
    # [elided in listing: try:]
    self.report_age_confirmation()
    disclaimer = compat_urllib_request.urlopen(request).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
def _real_extract(self, url):
    # NOTE(review): sampled excerpt — guard lines (if mobj is None:, try:,
    # else:, return, dict delimiters) are elided at the marked points.

    # Extract id and simplified title from URL
    mobj = re.match(self._VALID_URL, url)
    # [elided in listing: if mobj is None: ... return]
    self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
    video_id = mobj.group(1)

    # Check if video comes from YouTube
    mobj2 = re.match(r'^yt-(.*)$', video_id)
    if mobj2 is not None:
        # Delegate yt-prefixed ids straight to the YouTube extractor.
        self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
        # [elided in listing: return]

    # Retrieve video webpage to extract further information
    request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
    # [elided in listing: try:]
    self.report_download_webpage(video_id)
    webpage = compat_urllib_request.urlopen(request).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))

    # Extract URL, uploader and title from webpage
    self.report_extraction(video_id)
    mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
    # [elided in listing: if mobj is not None:]
    mediaURL = compat_urllib_parse.unquote(mobj.group(1))
    video_extension = mediaURL[-3:]

    # Extract gdaKey if available
    mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
    # [elided in listing: the no-gdaKey branch]
    gdaKey = mobj.group(1)
    video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
    # [elided in listing: else: — fallback path parsing the flashvars blob]
    mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
    # [elided in listing: if mobj is None: ... return]
    self._downloader.trouble(u'ERROR: unable to extract media URL')
    vardict = compat_parse_qs(mobj.group(1))
    if 'mediaData' not in vardict:
        self._downloader.trouble(u'ERROR: unable to extract media URL')
    mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
    # [elided in listing: if mobj is None: ... return]
    self._downloader.trouble(u'ERROR: unable to extract media URL')
    mediaURL = mobj.group(1).replace('\\/', '/')
    video_extension = mediaURL[-3:]
    video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

    mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
    # [elided in listing: if mobj is None: ... return]
    self._downloader.trouble(u'ERROR: unable to extract title')
    video_title = mobj.group(1).decode('utf-8')

    mobj = re.search(r'submitter=(.*?);', webpage)
    # [elided in listing: if mobj is None: ... return]
    self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
    video_uploader = mobj.group(1)

    # [elided in listing: return [{ opening of the result dict]
    'id': video_id.decode('utf-8'),
    'url': video_url.decode('utf-8'),
    'uploader': video_uploader.decode('utf-8'),
    'title': video_title,
    'ext': video_extension.decode('utf-8'),
    # [elided in listing: closing of the dict/list]
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # Case-insensitive match for dailymotion.<tld>/video/ URLs;
    # group(1) carries the raw id segment.
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
def __init__(self, downloader=None):
    InfoExtractor.__init__(self, downloader)

def report_download_webpage(self, video_id):
    """Announce that the video webpage is being downloaded."""
    msg = u'[dailymotion] %s: Downloading webpage' % video_id
    self._downloader.to_screen(msg)

def report_extraction(self, video_id):
    """Announce the start of information extraction."""
    msg = u'[dailymotion] %s: Extracting information' % video_id
    self._downloader.to_screen(msg)
def _real_extract(self, url):
    # NOTE(review): sampled excerpt — guard lines (if mobj is None:, try:,
    # else:, break, return, dict delimiters) are elided at the marked points.

    # Extract id and simplified title from URL
    mobj = re.match(self._VALID_URL, url)
    # [elided in listing: if mobj is None: ... return]
    self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
    # Strip the title slug and query string off the id segment.
    video_id = mobj.group(1).split('_')[0].split('?')[0]

    video_extension = 'mp4'

    # Retrieve video webpage to extract further information
    request = compat_urllib_request.Request(url)
    # family_filter=off so age-restricted pages are served.
    request.add_header('Cookie', 'family_filter=off')
    # [elided in listing: try:]
    self.report_download_webpage(video_id)
    webpage_bytes = compat_urllib_request.urlopen(request).read()
    webpage = webpage_bytes.decode('utf-8')
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))

    # Extract URL, uploader and title from webpage
    self.report_extraction(video_id)
    mobj = re.search(r'\s*var flashvars = (.*)', webpage)
    # [elided in listing: if mobj is None: ... return]
    self._downloader.trouble(u'ERROR: unable to extract media URL')
    flashvars = compat_urllib_parse.unquote(mobj.group(1))

    # Pick the best quality key present in flashvars, highest first.
    for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
        # [elided in listing: if key in flashvars: max_quality = key]
        self._downloader.to_screen(u'[dailymotion] Using %s' % key)
        # [elided in listing: break / for-else error branch]
    self._downloader.trouble(u'ERROR: unable to extract video URL')

    mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
    # [elided in listing: if mobj is None: ... return]
    self._downloader.trouble(u'ERROR: unable to extract video URL')

    video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

    # TODO: support choosing qualities

    mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
    # [elided in listing: if mobj is None: ... return]
    self._downloader.trouble(u'ERROR: unable to extract title')
    video_title = unescapeHTML(mobj.group('title'))

    video_uploader = None
    mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
    # [elided in listing: if mobj is None:]
    # looking for official user
    mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
    if mobj_official is None:
        self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
    # [elided in listing: else:]
    video_uploader = mobj_official.group(1)
    # [elided in listing: else: — owner span matched directly]
    video_uploader = mobj.group(1)

    video_upload_date = None
    # Page shows DD-MM-YYYY; reassembled below as YYYYMMDD.
    mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
    # [elided in listing: if mobj is not None:]
    video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

    # [elided in listing: return [{ with 'id'/'url' entries]
    'uploader': video_uploader,
    'upload_date': video_upload_date,
    'title': video_title,
    'ext': video_extension,
    # [elided in listing: closing of the dict/list]
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    # videoplay?docid= URLs across the national video.google domains;
    # group(1) is the docid.
    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
    IE_NAME = u'video.google'
def __init__(self, downloader=None):
    InfoExtractor.__init__(self, downloader)

def report_download_webpage(self, video_id):
    """Announce that the video webpage is being downloaded."""
    msg = u'[video.google] %s: Downloading webpage' % video_id
    self._downloader.to_screen(msg)

def report_extraction(self, video_id):
    """Announce the start of information extraction."""
    msg = u'[video.google] %s: Extracting information' % video_id
    self._downloader.to_screen(msg)
def _real_extract(self, url):
    # NOTE(review): sampled excerpt — guard lines (if mobj is None:, try:,
    # return, dict delimiters) are elided at the marked points.

    # Extract id from URL
    mobj = re.match(self._VALID_URL, url)
    # [elided in listing: if mobj is None: ... return]
    self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
    video_id = mobj.group(1)

    video_extension = 'mp4'

    # Retrieve video webpage to extract further information
    request = compat_urllib_request.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
    # [elided in listing: try:]
    self.report_download_webpage(video_id)
    webpage = compat_urllib_request.urlopen(request).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

    # Extract URL, uploader, and title from webpage
    self.report_extraction(video_id)
    mobj = re.search(r"download_url:'([^']+)'", webpage)
    # [elided in listing: if mobj is None: — fall back to the flv stream]
    video_extension = 'flv'
    mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
    # [elided in listing: if mobj is None: ... return]
    self._downloader.trouble(u'ERROR: unable to extract media URL')
    mediaURL = compat_urllib_parse.unquote(mobj.group(1))
    # Undo the \xNN escaping used inside the page's inline script.
    mediaURL = mediaURL.replace('\\x3d', '\x3d')
    mediaURL = mediaURL.replace('\\x26', '\x26')
    # [elided in listing: video_url assignment from mediaURL]

    mobj = re.search(r'<title>(.*)</title>', webpage)
    # [elided in listing: if mobj is None: ... return]
    self._downloader.trouble(u'ERROR: unable to extract title')
    video_title = mobj.group(1).decode('utf-8')

    # Extract video description
    mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
    # [elided in listing: if mobj is None: ... return]
    self._downloader.trouble(u'ERROR: unable to extract video description')
    video_description = mobj.group(1).decode('utf-8')
    if not video_description:
        video_description = 'No description available.'

    # Extract video thumbnail
    if self._downloader.params.get('forcethumbnail', False):
        # Thumbnail only lives on the search-results page, so fetch that.
        request = compat_urllib_request.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
        # [elided in listing: try:]
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
        mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
        # [elided in listing: if mobj is None: ... return]
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1)
    else:   # we need something to pass to process_info
        # [elided in listing: empty-string thumbnail fallback]

    # [elided in listing: return [{ opening of the result dict]
    'id': video_id.decode('utf-8'),
    'url': video_url.decode('utf-8'),
    'title': video_title,
    'ext': video_extension.decode('utf-8'),
    # [elided in listing: closing of the dict/list]
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # Matches URLs whose current= query parameter names a .flv file;
    # group(1) is that filename.
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'
def __init__(self, downloader=None):
    InfoExtractor.__init__(self, downloader)

def report_download_webpage(self, video_id):
    """Announce that the video webpage is being downloaded."""
    msg = u'[photobucket] %s: Downloading webpage' % video_id
    self._downloader.to_screen(msg)

def report_extraction(self, video_id):
    """Announce the start of information extraction."""
    msg = u'[photobucket] %s: Extracting information' % video_id
    self._downloader.to_screen(msg)
def _real_extract(self, url):
    # NOTE(review): sampled excerpt — guard lines (if mobj is None:, try:,
    # return, dict delimiters) are elided at the marked points.

    # Extract id from URL
    mobj = re.match(self._VALID_URL, url)
    # [elided in listing: if mobj is None: ... return]
    self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
    video_id = mobj.group(1)

    video_extension = 'flv'

    # Retrieve video webpage to extract further information
    request = compat_urllib_request.Request(url)
    # [elided in listing: try:]
    self.report_download_webpage(video_id)
    webpage = compat_urllib_request.urlopen(request).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

    # Extract URL, uploader, and title from webpage
    self.report_extraction(video_id)
    mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
    # [elided in listing: if mobj is None: ... return]
    self._downloader.trouble(u'ERROR: unable to extract media URL')
    mediaURL = compat_urllib_parse.unquote(mobj.group(1))
    # [elided in listing: video_url assignment from mediaURL]

    mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
    # [elided in listing: if mobj is None: ... return]
    self._downloader.trouble(u'ERROR: unable to extract title')
    video_title = mobj.group(1).decode('utf-8')
    video_uploader = mobj.group(2).decode('utf-8')

    # [elided in listing: return [{ opening of the result dict]
    'id': video_id.decode('utf-8'),
    'url': video_url.decode('utf-8'),
    'uploader': video_uploader,
    'title': video_title,
    'ext': video_extension.decode('utf-8'),
    # [elided in listing: closing of the dict/list]
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'
def __init__(self, downloader=None):
    InfoExtractor.__init__(self, downloader)

def report_download_webpage(self, video_id):
    """Announce that the video webpage is being downloaded."""
    msg = u'[video.yahoo] %s: Downloading webpage' % video_id
    self._downloader.to_screen(msg)

def report_extraction(self, video_id):
    """Announce the start of information extraction."""
    msg = u'[video.yahoo] %s: Extracting information' % video_id
    self._downloader.to_screen(msg)
912 def _real_extract(self, url, new_video=True):
913 # Extract ID from URL
914 mobj = re.match(self._VALID_URL, url)
916 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
919 video_id = mobj.group(2)
920 video_extension = 'flv'
922 # Rewrite valid but non-extractable URLs as
923 # extractable English language /watch/ URLs
924 if re.match(self._VPAGE_URL, url) is None:
925 request = compat_urllib_request.Request(url)
927 webpage = compat_urllib_request.urlopen(request).read()
928 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
929 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
932 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
934 self._downloader.trouble(u'ERROR: Unable to extract id field')
936 yahoo_id = mobj.group(1)
938 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
940 self._downloader.trouble(u'ERROR: Unable to extract vid field')
942 yahoo_vid = mobj.group(1)
944 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
945 return self._real_extract(url, new_video=False)
947 # Retrieve video webpage to extract further information
948 request = compat_urllib_request.Request(url)
950 self.report_download_webpage(video_id)
951 webpage = compat_urllib_request.urlopen(request).read()
952 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
953 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
956 # Extract uploader and title from webpage
957 self.report_extraction(video_id)
958 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
960 self._downloader.trouble(u'ERROR: unable to extract video title')
962 video_title = mobj.group(1).decode('utf-8')
964 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
966 self._downloader.trouble(u'ERROR: unable to extract video uploader')
968 video_uploader = mobj.group(1).decode('utf-8')
970 # Extract video thumbnail
971 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
973 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
975 video_thumbnail = mobj.group(1).decode('utf-8')
977 # Extract video description
978 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
980 self._downloader.trouble(u'ERROR: unable to extract video description')
982 video_description = mobj.group(1).decode('utf-8')
983 if not video_description:
984 video_description = 'No description available.'
986 # Extract video height and width
987 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
989 self._downloader.trouble(u'ERROR: unable to extract video height')
991 yv_video_height = mobj.group(1)
993 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
995 self._downloader.trouble(u'ERROR: unable to extract video width')
997 yv_video_width = mobj.group(1)
999 # Retrieve video playlist to extract media URL
1000 # I'm not completely sure what all these options are, but we
1001 # seem to need most of them, otherwise the server sends a 401.
1002 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1003 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1004 request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1005 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1006 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1008 self.report_download_webpage(video_id)
1009 webpage = compat_urllib_request.urlopen(request).read()
1010 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1011 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1014 # Extract media URL from playlist XML
1015 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1017 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1019 video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1020 video_url = unescapeHTML(video_url)
1023 'id': video_id.decode('utf-8'),
1025 'uploader': video_uploader,
1026 'upload_date': None,
1027 'title': video_title,
1028 'ext': video_extension.decode('utf-8'),
1029 'thumbnail': video_thumbnail.decode('utf-8'),
1030 'description': video_description,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs; group(1) is the numeric video id.
    _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract the video info dictionary for a vimeo.com URL.

        Parses the embedded player config JSON for metadata, then picks the
        best available codec/quality pair to build the play_redirect URL.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page source.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            # IndexError: marker not found; ValueError: malformed JSON.
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader
        video_uploader = config["video"]["owner"]["name"]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_id("description", webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # Extract upload date (optional: leave None if the marker is absent)
        video_upload_date = None
        mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = {'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':           video_id,
            'url':          video_url,
            'uploader':     video_uploader,
            'upload_date':  video_upload_date,
            'title':        video_title,
            'ext':          video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live-stream pages end in index-<n>.html.
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download url and return its raw content, or None on error."""
        self._downloader.increment_downloads()
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url, apply regex, and return a dict built from matchTuples.

        matchTuples is a list of (group_index, key, error_message); each
        named group is stored under key. Returns None (after reporting the
        given error) if the regex or any group fails to match.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Resolve the rtmp path/player for a live-stream page (no return value)."""
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the videoref chain of an Arte+7 page and return its info dict."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            # Live streams are resolved but not yet downloadable.
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    # Matches anything; this IE must be tried last.
    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      headers=newheaders,
                                                                      origin_req_host=req.get_origin_req_host(),
                                                                      unverifiable=True))

        # Build our opener from scratch so only these handlers are active.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        if url == new_url:
            # Not a redirect; let the normal extraction continue.
            return False

        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True

    def _real_extract(self, url):
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':           video_id,
            'url':          video_url,
            'uploader':     video_uploader,
            'upload_date':  None,
            'title':        video_title,
            'ext':          video_extension,
        }]
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse a ytsearch[N|all]:terms query and download the results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the leading 'ytsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n  # shrunk below once the API reports totalItems

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse a gvsearch[N|all]:terms query and download the results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the leading 'gvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # No further result pages: download what we have.
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse a yvsearch[N|all]:terms query and download the results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the leading 'yvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # No further result pages: download what we have.
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Collect all video ids of a playlist and hand them to the downloader."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Single video case: the URL names one specific video.
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])
            return

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        else:
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(playlist_id, pagenum)
            url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        total = len(video_ids)

        # Apply --playlist-start / --playlist-end (1-based start, -1 = open end).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        if len(video_ids) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        """Collect every video id of a channel and hand them to the downloader."""
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Download channel pages
        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(channel_id, pagenum)
            url = self._TEMPLATE_URL % (channel_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []
            for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Collect all upload ids of a user via the GData API and download them."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)

        # Apply --playlist-start / --playlist-end (1-based start, -1 = open end).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    # NOTE(review): the Ajax endpoint returns at most this many episodes per
    # page (see the pagination comment below) — confirm against the service.
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Collect all episode paths of a blip.tv user and download them."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            # Resolve the numeric users_id needed by the Ajax endpoint.
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            mobj = re.search(r'data-users-id="([^"]+)"', page)
            page_base = page_base % mobj.group(1)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)

            request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)

        # Apply --playlist-start / --playlist-end (1-based start, -1 = open end).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
# Extracts direct download links from depositfiles.com file pages by
# POSTing the "Free download" form and scraping the fileshare form action.
# NOTE(review): guard/`try:`/`return` lines appear elided in this copy —
# verify control flow against the canonical source.
1980 class DepositFilesIE(InfoExtractor):
1981 """Information extractor for depositfiles.com"""
1983 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1984 IE_NAME = u'DepositFiles'
1986 def __init__(self, downloader=None):
1987 InfoExtractor.__init__(self, downloader)
1989 def report_download_webpage(self, file_id):
1990 """Report webpage download."""
1991 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1993 def report_extraction(self, file_id):
1994 """Report information extraction."""
1995 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1997 def _real_extract(self, url):
1998 file_id = url.split('/')[-1]
1999 # Rebuild url in english locale
2000 url = 'http://depositfiles.com/en/files/' + file_id
2002 # Retrieve file webpage with 'Free download' button pressed
# gateway_result=1 simulates pressing the free-download button.
2003 free_download_indication = { 'gateway_result' : '1' }
2004 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
2006 self.report_download_webpage(file_id)
2007 webpage = compat_urllib_request.urlopen(request).read()
2008 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2009 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
2012 # Search for the real file URL
2013 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2014 if (mobj is None) or (mobj.group(1) is None):
2015 # Try to figure out reason of the error.
# The site renders restriction notices inside a <strong>Attention...</strong>.
2016 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2017 if (mobj is not None) and (mobj.group(1) is not None):
2018 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2019 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2021 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2024 file_url = mobj.group(1)
# Extension taken from the real file URL, without the leading dot.
2025 file_extension = os.path.splitext(file_url)[1][1:]
2027 # Search for file title
2028 mobj = re.search(r'<b title="(.*?)">', webpage)
2030 self._downloader.trouble(u'ERROR: unable to extract title')
2032 file_title = mobj.group(1).decode('utf-8')
# Result dict: mandatory fields per the InfoExtractor contract.
2035 'id': file_id.decode('utf-8'),
2036 'url': file_url.decode('utf-8'),
2038 'upload_date': None,
2039 'title': file_title,
2040 'ext': file_extension.decode('utf-8'),
# Facebook video extractor. Logs in first (credentials from CLI options or
# ~/.netrc), scrapes JS-embedded metadata from the watch page, then picks
# download formats out of the ("<fmt>_src", "...") JavaScript assignments.
# NOTE(review): many guard/`try:`/`return`/`else:` lines are elided in this
# copy of the file — confirm exact branching against the canonical source.
2044 class FacebookIE(InfoExtractor):
2045 """Information Extractor for Facebook"""
2048 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2049 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2050 _NETRC_MACHINE = 'facebook'
# Ordered best-to-worst; format selection below relies on this ordering.
2051 _available_formats = ['video', 'highqual', 'lowqual']
2052 _video_extensions = {
2057 IE_NAME = u'facebook'
2059 def __init__(self, downloader=None):
2060 InfoExtractor.__init__(self, downloader)
2062 def _reporter(self, message):
2063 """Add header and report message."""
2064 self._downloader.to_screen(u'[facebook] %s' % message)
2066 def report_login(self):
2067 """Report attempt to log in."""
2068 self._reporter(u'Logging in')
2070 def report_video_webpage_download(self, video_id):
2071 """Report attempt to download video webpage."""
2072 self._reporter(u'%s: Downloading video webpage' % video_id)
2074 def report_information_extraction(self, video_id):
2075 """Report attempt to extract video information."""
2076 self._reporter(u'%s: Extracting video information' % video_id)
2078 def _parse_page(self, video_webpage):
2079 """Extract video information from page"""
# Each metadata piece has its own scraping regex; missing pieces are
# simply absent from the returned dict (callers check membership).
2081 data = {'title': r'\("video_title", "(.*?)"\)',
2082 'description': r'<div class="datawrap">(.*?)</div>',
2083 'owner': r'\("video_owner_name", "(.*?)"\)',
2084 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2087 for piece in data.keys():
2088 mobj = re.search(data[piece], video_webpage)
2089 if mobj is not None:
2090 video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
# Media URLs: one ("<fmt>_src", "...") JS tuple per available format.
2094 for fmt in self._available_formats:
2095 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2096 if mobj is not None:
2097 # URL is in a Javascript segment inside an escaped Unicode format within
2098 # the generally utf-8 page
2099 video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
2100 video_info['video_urls'] = video_urls
2104 def _real_initialize(self):
# Authentication happens once per instance, before any extraction.
2105 if self._downloader is None:
2110 downloader_params = self._downloader.params
2112 # Attempt to use provided username and password or .netrc data
2113 if downloader_params.get('username', None) is not None:
2114 useremail = downloader_params['username']
2115 password = downloader_params['password']
2116 elif downloader_params.get('usenetrc', False):
2118 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2119 if info is not None:
2123 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2124 except (IOError, netrc.NetrcParseError) as err:
# .netrc problems are non-fatal: warn and continue unauthenticated.
2125 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
2128 if useremail is None:
2137 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2140 login_results = compat_urllib_request.urlopen(request).read()
# If the login form is still present in the response, login failed.
2141 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2142 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2144 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2145 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
2148 def _real_extract(self, url):
2149 mobj = re.match(self._VALID_URL, url)
2151 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2153 video_id = mobj.group('ID')
2156 self.report_video_webpage_download(video_id)
2157 request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2159 page = compat_urllib_request.urlopen(request)
2160 video_webpage = page.read()
2161 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2162 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2165 # Start extracting information
2166 self.report_information_extraction(video_id)
2168 # Extract information
2169 video_info = self._parse_page(video_webpage)
# uploader and title are mandatory; thumbnail/date/description optional.
2172 if 'owner' not in video_info:
2173 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2175 video_uploader = video_info['owner']
2178 if 'title' not in video_info:
2179 self._downloader.trouble(u'ERROR: unable to extract video title')
2181 video_title = video_info['title']
2182 video_title = video_title.decode('utf-8')
2185 if 'thumbnail' not in video_info:
2186 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2187 video_thumbnail = ''
2189 video_thumbnail = video_info['thumbnail']
# Upload date comes as an RFC-2822 string; normalize to YYYYMMDD.
2193 if 'upload_date' in video_info:
2194 upload_time = video_info['upload_date']
2195 timetuple = email.utils.parsedate_tz(upload_time)
2196 if timetuple is not None:
2198 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2203 video_description = video_info.get('description', 'No description available.')
2205 url_map = video_info['video_urls']
2206 if len(url_map.keys()) > 0:
2207 # Decide which formats to download
2208 req_format = self._downloader.params.get('format', None)
2209 format_limit = self._downloader.params.get('format_limit', None)
# format_limit caps quality: keep only formats at or below the limit.
2211 if format_limit is not None and format_limit in self._available_formats:
2212 format_list = self._available_formats[self._available_formats.index(format_limit):]
2214 format_list = self._available_formats
2215 existing_formats = [x for x in format_list if x in url_map]
2216 if len(existing_formats) == 0:
2217 self._downloader.trouble(u'ERROR: no known formats available for video')
2219 if req_format is None:
2220 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2221 elif req_format == 'worst':
2222 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2223 elif req_format == '-1':
2224 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2227 if req_format not in url_map:
2228 self._downloader.trouble(u'ERROR: requested format not available')
2230 video_url_list = [(req_format, url_map[req_format])] # Specific format
# One result dict per selected format.
2233 for format_param, video_real_url in video_url_list:
2235 video_extension = self._video_extensions.get(format_param, 'mp4')
2238 'id': video_id.decode('utf-8'),
2239 'url': video_real_url.decode('utf-8'),
2240 'uploader': video_uploader.decode('utf-8'),
2241 'upload_date': upload_date,
2242 'title': video_title,
2243 'ext': video_extension.decode('utf-8'),
2244 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2245 'thumbnail': video_thumbnail.decode('utf-8'),
2246 'description': video_description.decode('utf-8'),
# blip.tv extractor. Appends skin=json to the page URL to get a JSON
# metadata document; also handles the case where the URL already serves
# the media file directly (Content-Type: video/*).
# NOTE(review): several lines (guards, `try:`, dict braces, returns) are
# elided in this copy — confirm against the canonical source.
2250 class BlipTVIE(InfoExtractor):
2251 """Information extractor for blip.tv"""
2253 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# Used to pull the filename extension off the media URL.
2254 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2255 IE_NAME = u'blip.tv'
2257 def report_extraction(self, file_id):
2258 """Report information extraction."""
2259 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2261 def report_direct_download(self, title):
2262 """Report direct download detection."""
2263 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2265 def _real_extract(self, url):
2266 mobj = re.match(self._VALID_URL, url)
2268 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# cchar is '&' or '?' depending on whether the URL already has a query.
2275 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2276 request = compat_urllib_request.Request(json_url)
2277 self.report_extraction(mobj.group(1))
2280 urlh = compat_urllib_request.urlopen(request)
2281 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
# The URL itself is the media file: derive title/ext from the basename.
2282 basename = url.split('/')[-1]
2283 title,ext = os.path.splitext(basename)
2284 title = title.decode('UTF-8')
2285 ext = ext.replace('.', '')
2286 self.report_direct_download(title)
2291 'upload_date': None,
2296 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2297 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2299 if info is None: # Regular URL
# Not a direct file: parse the JSON metadata returned by skin=json.
2301 json_code_bytes = urlh.read()
2302 json_code = json_code_bytes.decode('utf-8')
2303 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2304 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
2308 json_data = json.loads(json_code)
2309 if 'Post' in json_data:
2310 data = json_data['Post']
# datestamp is e.g. '12-31-12 11:05PM'; normalized to YYYYMMDD.
2314 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2315 video_url = data['media']['url']
2316 umobj = re.match(self._URL_EXT, video_url)
2318 raise ValueError('Can not determine filename extension')
2319 ext = umobj.group(1)
2322 'id': data['item_id'],
2324 'uploader': data['display_name'],
2325 'upload_date': upload_date,
2326 'title': data['title'],
2328 'format': data['media']['mimeType'],
2329 'thumbnail': data['thumbnailUrl'],
2330 'description': data['description'],
2331 'player_url': data['embedUrl']
2333 except (ValueError,KeyError) as err:
2334 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
# blip.tv's CDN serves the file only to recognized user agents.
2337 std_headers['User-Agent'] = 'iTunes/10.6.1'
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de.

    Downloads the watch page for a myvideo.de video, scrapes the media
    server base URL out of the page's image_src <link> tag and derives
    the final .flv media URL plus the page <title> from it.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUG FIX: this line previously read `self._download.trouble(...)`
            # which raised AttributeError on any invalid URL — the attribute
            # assigned by InfoExtractor.set_downloader() is `_downloader`.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Group 1 of _VALID_URL is the numeric video id.
        video_id = mobj.group(1)

        # Get video webpage
        request = compat_urllib_request.Request('http://www.myvideo.de/watch/%s' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        # The thumbnail <link> tag carries the media-server base URL; the
        # playable file lives at <base>/<video_id>.flv.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        # Mandatory fields per the InfoExtractor contract; myvideo.de
        # exposes neither uploader nor upload date on the watch page.
        return [{
            'id':          video_id,
            'url':         video_url,
            'uploader':    None,
            'upload_date': None,
            'title':       video_title,
            'ext':         u'flv',
        }]
# Daily Show / Colbert Report extractor: resolves shortcut URLs, follows
# the site redirect to a concrete episode, locates the mtvnservices Flash
# URI, downloads the MRSS index, then a per-item config XML listing the
# RTMP renditions, and finally rewrites a known-broken CDN host.
# NOTE(review): many lines (guards, `try:`, `else:`, loop headers, dict
# literals, returns) are elided in this copy — too intricate to restyle;
# confirm exact control flow against the canonical source.
2399 class ComedyCentralIE(InfoExtractor):
2400 """Information extractor for The Daily Show and Colbert Report """
2402 # urls can be abbreviations like :thedailyshow or :colbert
2403 # urls for episodes like:
2404 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2405 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2406 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2407 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2408 |(https?://)?(www\.)?
2409 (?P<showname>thedailyshow|colbertnation)\.com/
2410 (full-episodes/(?P<episode>.*)|
2412 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2413 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
2415 IE_NAME = u'comedycentral'
# Bitrates, ordered worst-to-best; turls[-1] below picks the best.
2417 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2419 _video_extensions = {
2427 _video_dimensions = {
2436 def suitable(self, url):
2437 """Receives a URL and returns True if suitable for this IE."""
# Overridden because _VALID_URL is a verbose (re.VERBOSE) pattern.
2438 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
2440 def report_extraction(self, episode_id):
2441 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2443 def report_config_download(self, episode_id):
2444 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2446 def report_index_download(self, episode_id):
2447 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2449 def report_player_url(self, episode_id):
2450 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2453 def _print_formats(self, formats):
# Implements --list-formats for this extractor.
2454 print('Available formats:')
2456 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2459 def _real_extract(self, url):
2460 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2462 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Expand :tds / :colbert shortcuts to the full-episodes listing URL.
2465 if mobj.group('shortname'):
2466 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2467 url = u'http://www.thedailyshow.com/full-episodes/'
2469 url = u'http://www.colbertnation.com/full-episodes/'
2470 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2471 assert mobj is not None
# Clip URLs carry their title directly; episode URLs may need a redirect.
2473 if mobj.group('clip'):
2474 if mobj.group('showname') == 'thedailyshow':
2475 epTitle = mobj.group('tdstitle')
2477 epTitle = mobj.group('cntitle')
2480 dlNewest = not mobj.group('episode')
2482 epTitle = mobj.group('showname')
2484 epTitle = mobj.group('episode')
2486 req = compat_urllib_request.Request(url)
2487 self.report_extraction(epTitle)
2489 htmlHandle = compat_urllib_request.urlopen(req)
2490 html = htmlHandle.read()
2491 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2492 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
# Re-parse the post-redirect URL; "newest episode" redirects here.
2495 url = htmlHandle.geturl()
2496 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2498 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2500 if mobj.group('episode') == '':
2501 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2503 epTitle = mobj.group('episode')
# The mtvnservices media URI lives in a <param name="movie"> or var url=.
2505 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', html)
2507 if len(mMovieParams) == 0:
2508 # The Colbert Report embeds the information in a without
2509 # a URL prefix; so extract the alternate reference
2510 # and then add the URL prefix manually.
2512 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', html)
2513 if len(altMovieParams) == 0:
2514 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2517 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2519 playerUrl_raw = mMovieParams[0][0]
2520 self.report_player_url(epTitle)
2522 urlHandle = compat_urllib_request.urlopen(playerUrl_raw)
2523 playerUrl = urlHandle.geturl()
2524 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2525 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + compat_str(err))
# The MRSS feed enumerates every media item in the episode.
2528 uri = mMovieParams[0][1]
2529 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2530 self.report_index_download(epTitle)
2532 indexXml = compat_urllib_request.urlopen(indexUrl).read()
2533 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2534 self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
2539 idoc = xml.etree.ElementTree.fromstring(indexXml)
2540 itemEls = idoc.findall('.//item')
2541 for itemEl in itemEls:
2542 mediaId = itemEl.findall('./guid')[0].text
2543 shortMediaId = mediaId.split(':')[-1]
2544 showId = mediaId.split(':')[-2].replace('.com', '')
2545 officialTitle = itemEl.findall('./title')[0].text
2546 officialDate = itemEl.findall('./pubDate')[0].text
# Per-item config XML lists the available renditions (bitrate, RTMP src).
2548 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2549 compat_urllib_parse.urlencode({'uri': mediaId}))
2550 configReq = compat_urllib_request.Request(configUrl)
2551 self.report_config_download(epTitle)
2553 configXml = compat_urllib_request.urlopen(configReq).read()
2554 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2555 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
2558 cdoc = xml.etree.ElementTree.fromstring(configXml)
2560 for rendition in cdoc.findall('.//rendition'):
2561 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2565 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2568 if self._downloader.params.get('listformats', None):
2569 self._print_formats([i[0] for i in turls])
2572 # For now, just pick the highest bitrate
2573 format,video_url = turls[-1]
2575 # Get the format arg from the arg stream
2576 req_format = self._downloader.params.get('format', None)
2578 # Select format if we can find one
2581 format, video_url = f, v
2584 # Patch to download from alternative CDN, which does not
2585 # break on current RTMPDump builds
2586 broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
2587 better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"
2589 if video_url.startswith(broken_cdn):
2590 video_url = video_url.replace(broken_cdn, better_cdn)
2592 effTitle = showId + u'-' + epTitle
2597 'upload_date': officialDate,
2602 'description': officialTitle,
2603 'player_url': None #playerUrl
# One info dict per media item; the accumulated list is returned.
2606 results.append(info)
# The Escapist extractor: reads og:* meta tags from the page, follows the
# player's config= query parameter to a JS "JSON" config, and takes the
# media URL from the config playlist.
# NOTE(review): `if ... is None:`/`try:`/`return` lines appear elided in
# this copy — confirm against the canonical source.
2611 class EscapistIE(InfoExtractor):
2612 """Information extractor for The Escapist """
2614 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2615 IE_NAME = u'escapist'
2617 def report_extraction(self, showName):
2618 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2620 def report_config_download(self, showName):
2621 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2623 def _real_extract(self, url):
2624 mobj = re.match(self._VALID_URL, url)
2626 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2628 showName = mobj.group('showname')
2629 videoId = mobj.group('episode')
2631 self.report_extraction(showName)
2633 webPage = compat_urllib_request.urlopen(url)
2634 webPageBytes = webPage.read()
# Decode using the charset advertised in Content-Type, utf-8 otherwise.
2635 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2636 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2637 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2638 self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
# Metadata comes from standard description/og:image/og:video meta tags.
2641 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2642 description = unescapeHTML(descMatch.group(1))
2643 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2644 imgUrl = unescapeHTML(imgMatch.group(1))
2645 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2646 playerUrl = unescapeHTML(playerUrlMatch.group(1))
# The player URL embeds a percent-encoded config URL in its query string.
2647 configUrlMatch = re.search('config=(.*)$', playerUrl)
2648 configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2650 self.report_config_download(showName)
2652 configJSON = compat_urllib_request.urlopen(configUrl).read()
2653 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2654 self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
2657 # Technically, it's JavaScript, not JSON
# Single → double quotes so json.loads can parse the JS object literal.
2658 configJSON = configJSON.replace("'", '"')
2661 config = json.loads(configJSON)
2662 except (ValueError,) as err:
2663 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
# Playlist entry [1] holds the actual media URL.
2666 playlist = config['playlist']
2667 videoUrl = playlist[1]['url']
2672 'uploader': showName,
2673 'upload_date': None,
2676 'thumbnail': imgUrl,
2677 'description': description,
2678 'player_url': playerUrl,
# collegehumor.com extractor: fetches the moogaloop metadata XML, then the
# Adobe HDS (f4m) manifest it points at, and assembles a segment URL from
# the manifest's media/id nodes.
# NOTE(review): guard/`try:`/`return` lines appear elided in this copy —
# confirm against the canonical source.
2684 class CollegeHumorIE(InfoExtractor):
2685 """Information extractor for collegehumor.com"""
2688 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2689 IE_NAME = u'collegehumor'
2691 def report_manifest(self, video_id):
2692 """Report manifest download."""
2693 self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))
2695 def report_extraction(self, video_id):
2696 """Report information extraction."""
2697 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2699 def _real_extract(self, url):
2700 mobj = re.match(self._VALID_URL, url)
2702 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2704 video_id = mobj.group('videoid')
2709 'upload_date': None,
2712 self.report_extraction(video_id)
# Step 1: per-video metadata XML (description, caption, thumbnail, file).
2713 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2715 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2716 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2717 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2720 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2722 videoNode = mdoc.findall('./video')[0]
2723 info['description'] = videoNode.findall('./description')[0].text
2724 info['title'] = videoNode.findall('./caption')[0].text
2725 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2726 manifest_url = videoNode.findall('./file')[0].text
2728 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
# Step 2: HDS manifest; hdcore param is required by the Adobe server.
2731 manifest_url += '?hdcore=2.10.3'
2732 self.report_manifest(video_id)
2734 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2735 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2736 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
# Manifest elements live in the f4m namespace.
2739 adoc = xml.etree.ElementTree.fromstring(manifestXml)
2741 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2742 node_id = media_node.attrib['url']
2743 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2744 except IndexError as err:
2745 self._downloader.trouble(u'\nERROR: Invalid manifest file')
# Step 3: compose the fragment URL from manifest host + ids.
2748 url_pr = compat_urllib_parse_urlparse(manifest_url)
2749 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
# xvideos.com extractor: pulls the percent-encoded flv_url parameter, the
# page <title>, and the thumbnail URL straight out of the watch page HTML.
# NOTE(review): `if mobj is None:`/`try:`/`return` lines appear elided in
# this copy — confirm against the canonical source.
2756 class XVideosIE(InfoExtractor):
2757 """Information extractor for xvideos.com"""
2759 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2760 IE_NAME = u'xvideos'
2762 def report_webpage(self, video_id):
2763 """Report webpage download."""
2764 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2766 def report_extraction(self, video_id):
2767 """Report information extraction."""
2768 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2770 def _real_extract(self, url):
2771 mobj = re.match(self._VALID_URL, url)
2773 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2775 video_id = mobj.group(1)
2777 self.report_webpage(video_id)
# Canonicalize the fetch URL regardless of what the caller passed in.
2779 request = compat_urllib_request.Request(r'http://www.xvideos.com/video' + video_id)
2781 webpage_bytes = compat_urllib_request.urlopen(request).read()
2782 webpage = webpage_bytes.decode('utf-8', 'replace')
2783 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2784 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2787 self.report_extraction(video_id)
# Media URL is a percent-encoded flv_url= query parameter in the page.
2791 mobj = re.search(r'flv_url=(.+?)&', webpage)
2793 self._downloader.trouble(u'ERROR: unable to extract video url')
2795 video_url = compat_urllib_parse.unquote(mobj.group(1))
# Title: page <title> minus the trailing " - XVID..." suffix.
2799 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2801 self._downloader.trouble(u'ERROR: unable to extract video title')
2803 video_title = mobj.group(1)
2806 # Extract video thumbnail
2807 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2809 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
# group(0): the whole matched URL is the thumbnail link.
2811 video_thumbnail = mobj.group(0)
2817 'upload_date': None,
2818 'title': video_title,
2820 'thumbnail': video_thumbnail,
2821 'description': None,
# soundcloud.com extractor: resolves the human-readable track URL to a
# numeric track id via the public resolve API, then asks the streams API
# for the 128kbps MP3 stream URL.
# NOTE(review): `if mobj is None:`/`try:`/`return` lines appear elided in
# this copy — confirm against the canonical source.
2827 class SoundcloudIE(InfoExtractor):
2828 """Information extractor for soundcloud.com
2829 To access the media, the uid of the song and a stream token
2830 must be extracted from the page source and the script must make
2831 a request to media.soundcloud.com/crossdomain.xml. Then
2832 the media can be grabbed by requesting from an url composed
2833 of the stream token and uid
2836 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2837 IE_NAME = u'soundcloud'
2839 def __init__(self, downloader=None):
2840 InfoExtractor.__init__(self, downloader)
2842 def report_resolve(self, video_id):
2843 """Report id resolution."""
2844 self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))
2846 def report_extraction(self, video_id):
2847 """Report stream retrieval."""
2848 self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))
2850 def _real_extract(self, url):
2851 mobj = re.match(self._VALID_URL, url)
2853 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2856 # extract uploader (which is in the url)
2857 uploader = mobj.group(1)
2858 # extract simple title (uploader + slug of song title)
2859 slug_title = mobj.group(2)
2860 simple_title = uploader + u'-' + slug_title
2862 self.report_resolve('%s/%s' % (uploader, slug_title))
# resolve.json maps the public track URL to the API's track metadata.
2864 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2865 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2866 request = compat_urllib_request.Request(resolv_url)
2868 info_json_bytes = compat_urllib_request.urlopen(request).read()
2869 info_json = info_json_bytes.decode('utf-8')
2870 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2871 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2874 info = json.loads(info_json)
2875 video_id = info['id']
2876 self.report_extraction('%s/%s' % (uploader, slug_title))
# The streams endpoint returns the per-format stream URLs for the track.
2878 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2879 request = compat_urllib_request.Request(streams_url)
2881 stream_json_bytes = compat_urllib_request.urlopen(request).read()
2882 stream_json = stream_json_bytes.decode('utf-8')
2883 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2884 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2887 streams = json.loads(stream_json)
2888 mediaURL = streams['http_mp3_128_url']
2893 'uploader': info['user']['username'],
2894 'upload_date': info['created_at'],
2895 'title': info['title'],
2897 'description': info['description'],
# infoq.com extractor: the RTMP media path is stored base64-encoded in a
# jsclassref attribute on the page; title and description come from page
# JavaScript and the description meta tag.
# NOTE(review): `if mobj is None:`/`try:`/`return` lines appear elided in
# this copy — confirm against the canonical source.
2901 class InfoQIE(InfoExtractor):
2902 """Information extractor for infoq.com"""
2904 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2907 def report_webpage(self, video_id):
2908 """Report webpage download."""
2909 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2911 def report_extraction(self, video_id):
2912 """Report information extraction."""
2913 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2915 def _real_extract(self, url):
2916 mobj = re.match(self._VALID_URL, url)
2918 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2921 self.report_webpage(url)
2923 request = compat_urllib_request.Request(url)
2925 webpage = compat_urllib_request.urlopen(request).read()
2926 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2927 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2930 self.report_extraction(url)
# jsclassref holds the base64-encoded RTMP path for the video.
2934 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2936 self._downloader.trouble(u'ERROR: unable to extract video url')
2938 video_url = 'rtmpe://video.infoq.com/cfx/st/' + compat_urllib_parse.unquote(mobj.group(1).decode('base64'))
# Title is assigned to a contentTitle JS variable in the page.
2942 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2944 self._downloader.trouble(u'ERROR: unable to extract video title')
2946 video_title = mobj.group(1).decode('utf-8')
2948 # Extract description
2949 video_description = u'No description available.'
2950 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2951 if mobj is not None:
2952 video_description = mobj.group(1).decode('utf-8')
# Derive id and extension from the media filename at the end of the URL.
2954 video_filename = video_url.split('/')[-1]
2955 video_id, extension = video_filename.split('.')
2961 'upload_date': None,
2962 'title': video_title,
2963 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2965 'description': video_description,
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com.

    NOTE(review): recovered from a whitespace-mangled paste; try/guard/return
    scaffolding was restored from the surviving lines — verify against VCS.
    """
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                url = None

        return None

    def _print_formats(self, formats):
        """List available formats/bitrates on stdout (--list-formats support)."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # try each format in turn, keep the first with a live URL
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats.keys():
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
        }]
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom.

    Handles three URL shapes via _VALID_URL named groups: a specific video
    (course + video), a course page (course only, expanded recursively), and
    the root page (expanded into all courses).

    NOTE(review): recovered from a whitespace-mangled paste; guard/return
    lines and dict/list delimiters were restored — verify against VCS.
    """
    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            try:
                coursepage = compat_urllib_request.urlopen(url).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Collect every VideoPage link and recurse through self.extract
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # Collect every CoursePage link and recurse through self.extract
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com.

    NOTE(review): recovered from a whitespace-mangled paste; guard/return
    lines and the final info dict were restored — verify against VCS.
    """
    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            # the scheme is optional in _VALID_URL; normalize for the request
            url = 'http://' + url
        video_id = mobj.group('videoid')
        self.report_webpage(video_id)

        request = compat_urllib_request.Request(url)
        try:
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        # mediaGen returns an XML document listing the available renditions
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Youku splits a video into segments; the real file ids are obfuscated with
    a seeded shuffle of a fixed character alphabet (see _get_file_ID_mix_string).

    NOTE(review): recovered from a whitespace-mangled paste; loop/branch
    scaffolding was restored from the surviving lines — verify against VCS.
    """
    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
    IE_NAME = u'Youku'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)

    def _gen_sid(self):
        # session id: millisecond timestamp plus two random suffixes
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        # Deterministically shuffle the alphabet with a linear congruential
        # generator seeded by the server-provided 'seed' value.
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        # map each '*'-separated index through the shuffled alphabet
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = config['data'][0]['streamfileids'].keys()

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        files_info = []
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com.

    NOTE(review): recovered from a whitespace-mangled paste; guard/return
    lines were restored — verify against VCS.
    """
    _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    # page-scraping patterns
    VIDEO_URL_RE = r'flv_url=(.*?)&'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        # Get webpage content
        try:
            webpage_bytes = compat_urllib_request.urlopen(url).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return

        result = re.search(self.VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group(1))

        result = re.search(self.VIDEO_TITLE_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = result.group(1)

        result = re.search(self.VIDEO_THUMB_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = result.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com.

    NOTE(review): recovered from a whitespace-mangled paste; optional-field
    defaults and guard/return lines were restored — verify against VCS.
    """
    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading extry"""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url.decode('utf-8'))

    def report_date(self, upload_date):
        """Report downloading extry"""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report downloading extry"""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader.decode('utf-8'))

    def report_title(self, video_title):
        """Report downloading extry"""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title.decode('utf-8'))

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page.decode('utf-8'))

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(2)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video page URL')
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)

        # Extract video links on video page
        """Extract video links of all sizes"""
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.trouble(u'ERROR: unable to extract video links')
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        video_url = unicode(video_url, "unicode_escape")

        return [{
            'id': video_id.decode('utf-8'),
            'url': video_url,
            'uploader': uploader.decode('utf-8'),
            'upload_date': upload_date.decode('utf-8'),
            'title': video_title.decode('utf-8'),
            'ext': video_extension.decode('utf-8'),
        }]
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages.

    NOTE(review): recovered from a whitespace-mangled paste; guard/return
    lines and the info dict delimiters were restored — verify against VCS.
    """
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        self.report_extraction(video_id)
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        # the CDN URL is derived from the page path, not scraped
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            # first regex group from the page, HTML-unescaped, or default
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # BUGFIX: key was 'uploader_date', which the downloader never reads;
            # the module contract (class docstring at top of file) names it 'upload_date'.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3638 class JustinTVIE(InfoExtractor):
3639 """Information extractor for justin.tv and twitch.tv"""
3640 # TODO: One broadcast may be split into multiple videos. The key
3641 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3642 # starts at 1 and increases. Can we treat all parts as one video?
3644 # _VALID_URL = r"""^(?:http(?:s?)://)?www\.(?:justin|twitch)\.tv/
3645 # ([^/]+)(?:/b/([^/]+))?/?(?:#.*)?$"""
3646 _VALID_URL = r'^http://www.twitch.tv/(.*)$'
3647 IE_NAME = u'justin.tv'
3649 _max_justin_results = 1000
3650 _justin_page_limit = 100
3652 def report_extraction(self, file_id):
3653 """Report information extraction."""
3654 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3656 # Return count of items, list of *valid* items
3657 def _parse_page(self, url):
3660 urlh = compat_urllib_request.urlopen(url)
3661 webpage_bytes = urlh.read()
3662 webpage = webpage_bytes.decode('utf-8', 'ignore')
3663 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3664 self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
3667 response = json.loads(webpage)
3669 for clip in response:
3670 video_url = clip['video_file_url']
3672 video_extension = os.path.splitext(video_url)[1][1:]
3673 video_date = re.sub('-', '', clip['created_on'][:10])
3677 'title': clip['title'],
3678 'uploader': clip['user_id'] or clip['channel_id'],
3679 'upload_date': video_date,
3680 'ext': video_extension,
3683 return (len(response), info)
3685 def _real_extract(self, url):
3686 mobj = re.match(self._VALID_URL, url)
3688 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3691 api = 'http://api.justin.tv'
3692 video_id = mobj.group(mobj.lastindex)
3694 if mobj.lastindex == 1:
3696 api += '/channel/archives/%s.json'
3698 api += '/clip/show/%s.json'
3699 api = api % (video_id,)
3701 self.report_extraction(video_id)
3705 limit = self._justin_page_limit
3706 while offset < self._max_justin_results:
3707 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3708 page_count, page_info = self._parse_page(page_url)
3709 info.extend(page_info)
3710 if not paged or page_count != limit: