2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
13 import xml.etree.ElementTree
# NOTE(review): this chunk is a line-sampled paste — the leading integer on each
# line is the ORIGINAL file line number and gaps in that numbering mean source
# lines are missing here. Do not treat this block as runnable as-is; reconcile
# against the full upstream file before editing logic.
20 class InfoExtractor(object):
21 """Information Extractor class.
23 Information extractors are the classes that, given a URL, extract
24 information about the video (or videos) the URL refers to. This
25 information includes the real video URL, the video title, author and
26 others. The information is stored in a dictionary which is then
27 passed to the FileDownloader. The FileDownloader processes this
28 information possibly downloading the video to the file system, among
29 other possible outcomes.
31 The dictionaries must include the following fields:
35 uploader: Nickname of the video uploader, unescaped.
36 upload_date: Video upload date (YYYYMMDD).
37 title: Video title, unescaped.
38 ext: Video filename extension.
40 The following fields are optional:
42 format: The video format, defaults to ext (used for --get-format)
43 thumbnail: Full URL to a video thumbnail image.
44 description: One-line video description.
45 player_url: SWF Player URL (used for rtmpdump).
46 subtitles: The .srt file contents.
47 urlhandle: [internal] The urlHandle to be used to download the file,
48 like returned by urllib.request.urlopen
50 The fields should all be Unicode strings.
52 Subclasses of this one should re-define the _real_initialize() and
53 _real_extract() methods and define a _VALID_URL regexp.
54 Probably, they should also be added to the list of extractors.
56 _real_extract() must return a *list* of information dictionaries as
59 Finally, the _WORKING attribute should be set to False for broken IEs
60 in order to warn the users and skip the tests.
# Constructor: stores the (optional) downloader via set_downloader().
# NOTE(review): original line 69 is missing here (presumably the
# `self._ready = False` lazy-init flag — confirm against upstream).
67 def __init__(self, downloader=None):
68 """Constructor. Receives an optional downloader."""
70 self.set_downloader(downloader)
# Matches the URL against the subclass-supplied _VALID_URL regexp.
72 def suitable(self, url):
73 """Receives a URL and returns True if suitable for this IE."""
74 return re.match(self._VALID_URL, url) is not None
# NOTE(review): the `def` headers for the _WORKING accessor (orig. ~75-76)
# and for initialize() (orig. ~79-80) were lost in sampling; only their
# docstrings/bodies remain below.
77 """Getter method for _WORKING."""
81 """Initializes an instance (authentication, etc)."""
83 self._real_initialize()
# Public entry point; delegates to the subclass hook _real_extract().
# NOTE(review): original line 88 (likely `self.initialize()`) is missing.
86 def extract(self, url):
87 """Extracts URL information and returns it in list of dicts."""
89 return self._real_extract(url)
91 def set_downloader(self, downloader):
92 """Sets the downloader for this IE."""
93 self._downloader = downloader
# Template-method hooks: no-op / abstract in the base class, overridden by
# concrete extractors below. Their `pass` bodies were sampled out.
95 def _real_initialize(self):
96 """Real initialization process. Redefine in subclasses."""
99 def _real_extract(self, url):
100 """Real extraction process. Redefine in subclasses."""
# NOTE(review): line-sampled source — leading integers are original line
# numbers; numbering gaps mean code is missing (e.g. the `_VALID_URL = r'''`
# opener before orig. line 109, every `try:` line, and several `return`s).
# Reconcile against upstream before changing logic.
104 class YoutubeIE(InfoExtractor):
105 """Information extractor for youtube.com."""
# Body of the verbose (re.VERBOSE) _VALID_URL regex; its opening
# `_VALID_URL = r'''` line (orig. ~106-108) is missing from this sample.
109 (?:https?://)? # http(s):// (optional)
110 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
111 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
112 (?:.*?\#/)? # handle anchor (#/) redirect urls
113 (?!view_play_list|my_playlists|artist|playlist) # ignore playlist URLs
114 (?: # the various things that can precede the ID:
115 (?:(?:v|embed|e)/) # v/ or embed/ or e/
116 |(?: # or the v= param in all its forms
117 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
118 (?:\?|\#!?) # the params delimiter ? or # or #!
119 (?:.+&)? # any other preceding param (like /?s=tuff&v=xxxx)
122 )? # optional -> youtube.com/xxxx is OK
123 )? # all until now is optional -> you can pass the naked ID
124 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
125 (?(1).+)? # if we found the ID, everything can follow
# Helper URLs used by _real_initialize() for language, login and age gate.
127 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
128 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
129 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
130 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
131 _NETRC_MACHINE = 'youtube'
132 # Listed in order of quality
133 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
134 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
# itag -> container extension map; interior entries (orig. 136-140, 142-146)
# were sampled out.
135 _video_extensions = {
141 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
# itag -> "WxH" display map; all entries (orig. 148-164) were sampled out.
147 _video_dimensions = {
# Overrides the base suitable(): this regex needs re.VERBOSE.
165 def suitable(self, url):
166 """Receives a URL and returns True if suitable for this IE."""
167 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
169 def report_lang(self):
170 """Report attempt to set language."""
171 self._downloader.to_screen(u'[youtube] Setting language')
173 def report_login(self):
174 """Report attempt to log in."""
175 self._downloader.to_screen(u'[youtube] Logging in')
177 def report_age_confirmation(self):
178 """Report attempt to confirm age."""
179 self._downloader.to_screen(u'[youtube] Confirming age')
181 def report_video_webpage_download(self, video_id):
182 """Report attempt to download video webpage."""
183 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
185 def report_video_info_webpage_download(self, video_id):
186 """Report attempt to download video info webpage."""
187 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
# NOTE(review): docstring says "info webpage" but this reports subtitles —
# copy/paste slip in the original; text left as-is (doc-only review).
189 def report_video_subtitles_download(self, video_id):
190 """Report attempt to download video info webpage."""
191 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
193 def report_information_extraction(self, video_id):
194 """Report attempt to extract video information."""
195 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
197 def report_unavailable_format(self, video_id, format):
198 """Report extracted video URL."""
199 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
201 def report_rtmp_download(self):
202 """Indicate the download will use the RTMP protocol."""
203 self._downloader.to_screen(u'[youtube] RTMP download detected')
# Converts YouTube timedtext XML into SubRip (.srt) text with a regex.
# NOTE(review): the `srt = ''` accumulator init (orig. ~206), the
# `start = float(start)` conversion (orig. ~211) and the `return srt`
# (orig. ~220-221) are missing from this sample — without them the visible
# `start + float(dur)` would be a str+float TypeError; confirm upstream.
205 def _closed_captions_xml_to_srt(self, xml_string):
207 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
208 # TODO parse xml instead of regex
209 for n, (start, dur_tag, dur, caption) in enumerate(texts):
210 if not dur: dur = '4'
212 end = start + float(dur)
213 start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
214 end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
215 caption = unescapeHTML(caption)
216 caption = unescapeHTML(caption) # double cycle, intentional
217 srt += str(n+1) + '\n'
218 srt += start + ' --> ' + end + '\n'
219 srt += caption + '\n\n'
# Pretty-prints the itag/extension/dimension table for --list-formats.
# NOTE(review): the `for x in formats:` loop header (orig. ~224) is missing.
222 def _print_formats(self, formats):
223 print('Available formats:')
225 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
# Initialization: pick credentials (params or .netrc), force the English
# site, log in, then confirm age. Many `try:`/`return`/assignment lines
# (e.g. username/password defaults, netrc unpacking) are sampled out.
227 def _real_initialize(self):
228 if self._downloader is None:
233 downloader_params = self._downloader.params
235 # Attempt to use provided username and password or .netrc data
236 if downloader_params.get('username', None) is not None:
237 username = downloader_params['username']
238 password = downloader_params['password']
239 elif downloader_params.get('usenetrc', False):
241 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
246 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
247 except (IOError, netrc.NetrcParseError) as err:
248 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
# Set language (best effort — failure only warns).
252 request = compat_urllib_request.Request(self._LANG_URL)
255 compat_urllib_request.urlopen(request).read()
256 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
257 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
260 # No authentication to be performed
# Login form POST; a failed login is detected by the login form re-appearing.
266 'current_form': 'loginForm',
268 'action_login': 'Log In',
269 'username': username,
270 'password': password,
272 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
275 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
276 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
277 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
279 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
280 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
# Age-gate confirmation POST; unlike login, failure here is a hard error.
286 'action_confirm': 'Confirm',
288 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
290 self.report_age_confirmation()
291 age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
292 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
293 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
# Main extraction: resolve next_url redirects, fetch the watch page and
# get_video_info, then pick formats and build the info dict(s).
296 def _real_extract(self, url):
297 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
298 mobj = re.search(self._NEXT_URL_RE, url)
300 url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
302 # Extract video id from URL
303 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
305 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
307 video_id = mobj.group(2)
# Fetch the watch page (has_verified=1 skips some interstitials).
310 self.report_video_webpage_download(video_id)
311 request = compat_urllib_request.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
313 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
314 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
315 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
318 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
320 # Attempt to extract SWF player URL
321 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
# Un-escape the JSON-escaped URL (\" \/ -> " /).
323 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
# Try several get_video_info el= variants until one yields a token.
328 self.report_video_info_webpage_download(video_id)
329 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
330 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
331 % (video_id, el_type))
332 request = compat_urllib_request.Request(video_info_url)
334 video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
335 video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
336 video_info = compat_parse_qs(video_info_webpage)
337 if 'token' in video_info:
339 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
340 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
342 if 'token' not in video_info:
343 if 'reason' in video_info:
344 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
346 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
349 # Check for "rental" videos
350 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
351 self._downloader.trouble(u'ERROR: "rental" videos not supported')
354 # Start extracting information
355 self.report_information_extraction(video_id)
# uploader (required)
358 if 'author' not in video_info:
359 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
361 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
# title (required)
364 if 'title' not in video_info:
365 self._downloader.trouble(u'ERROR: unable to extract video title')
367 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
# thumbnail (optional — warning only)
370 if 'thumbnail_url' not in video_info:
371 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
373 else: # don't panic if we can't find it
374 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
# upload date, scraped from the page and normalized to YYYYMMDD.
378 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
380 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
381 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
382 for expression in format_expressions:
384 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
# description (optional)
389 video_description = get_element_by_id("eow-description", video_webpage)
390 if video_description:
391 video_description = clean_html(video_description)
393 video_description = ''
# closed captions -> .srt (only with --write-srt); failures are soft
# (raised as Trouble and downgraded to a warning at orig. line 425-426).
396 video_subtitles = None
397 if self._downloader.params.get('writesubtitles', False):
399 self.report_video_subtitles_download(video_id)
400 request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
402 srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
403 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
404 raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
405 srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
406 srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
407 if not srt_lang_list:
408 raise Trouble(u'WARNING: video has no closed captions')
409 if self._downloader.params.get('subtitleslang', False):
410 srt_lang = self._downloader.params.get('subtitleslang')
411 elif 'en' in srt_lang_list:
# NOTE(review): `.keys()[0]` is Python-2-only (dict views are not
# indexable on Python 3) — confirm the file's target interpreter.
414 srt_lang = srt_lang_list.keys()[0]
415 if not srt_lang in srt_lang_list:
416 raise Trouble(u'WARNING: no closed captions found in the specified language')
417 request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
419 srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
420 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
421 raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
423 raise Trouble(u'WARNING: unable to download video subtitles')
424 video_subtitles = self._closed_captions_xml_to_srt(srt_xml)
425 except Trouble as trouble:
426 self._downloader.trouble(str(trouble))
# duration (optional — warning only)
428 if 'length_seconds' not in video_info:
429 self._downloader.trouble(u'WARNING: unable to extract video duration')
432 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
# token (guaranteed present by the check at orig. line 342)
435 video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
437 # Decide which formats to download
438 req_format = self._downloader.params.get('format', None)
440 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
441 self.report_rtmp_download()
442 video_url_list = [(None, video_info['conn'][0])]
443 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
444 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
445 url_data = [compat_parse_qs(uds) for uds in url_data_strs]
446 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
447 url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
449 format_limit = self._downloader.params.get('format_limit', None)
450 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
451 if format_limit is not None and format_limit in available_formats:
452 format_list = available_formats[available_formats.index(format_limit):]
454 format_list = available_formats
455 existing_formats = [x for x in format_list if x in url_map]
456 if len(existing_formats) == 0:
457 self._downloader.trouble(u'ERROR: no known formats available for video')
459 if self._downloader.params.get('listformats', None):
460 self._print_formats(existing_formats)
462 if req_format is None or req_format == 'best':
463 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
464 elif req_format == 'worst':
465 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
466 elif req_format in ('-1', 'all'):
467 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
469 # Specific formats. We pick the first in a slash-delimeted sequence.
470 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
471 req_formats = req_format.split('/')
472 video_url_list = None
473 for rf in req_formats:
475 video_url_list = [(rf, url_map[rf])]
477 if video_url_list is None:
478 self._downloader.trouble(u'ERROR: requested format not available')
481 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
# Build one info dict per selected (format, url) pair; the `results = []`
# init, the dict opener/closer and the `return results` were sampled out.
485 for format_param, video_real_url in video_url_list:
487 video_extension = self._video_extensions.get(format_param, 'flv')
489 video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
490 self._video_dimensions.get(format_param, '???'))
494 'url': video_real_url,
495 'uploader': video_uploader,
496 'upload_date': upload_date,
497 'title': video_title,
498 'ext': video_extension,
499 'format': video_format,
500 'thumbnail': video_thumbnail,
501 'description': video_description,
502 'player_url': player_url,
503 'subtitles': video_subtitles,
504 'duration': video_duration
# NOTE(review): line-sampled source — leading integers are original line
# numbers; gaps (e.g. missing `try:` lines, missing `return`s, missing
# `if mobj is None:` guards) mean code was dropped. Reconcile upstream
# before changing logic.
509 class MetacafeIE(InfoExtractor):
510 """Information Extractor for metacafe.com."""
512 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
513 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
514 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
515 IE_NAME = u'metacafe'
517 def __init__(self, downloader=None):
518 InfoExtractor.__init__(self, downloader)
520 def report_disclaimer(self):
521 """Report disclaimer retrieval."""
522 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
524 def report_age_confirmation(self):
525 """Report attempt to confirm age."""
526 self._downloader.to_screen(u'[metacafe] Confirming age')
528 def report_download_webpage(self, video_id):
529 """Report webpage download."""
530 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
532 def report_extraction(self, video_id):
533 """Report information extraction."""
534 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
# Fetches the family-filter disclaimer page, then POSTs the age form to
# disable the filter for the session. `try:` lines are sampled out.
536 def _real_initialize(self):
537 # Retrieve disclaimer
538 request = compat_urllib_request.Request(self._DISCLAIMER)
540 self.report_disclaimer()
541 disclaimer = compat_urllib_request.urlopen(request).read()
542 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
543 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
# Age-confirmation form; other fields (orig. 546-548) were sampled out.
549 'submit': "Continue - I'm over 18",
551 request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
553 self.report_age_confirmation()
554 disclaimer = compat_urllib_request.urlopen(request).read()
555 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
556 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
559 def _real_extract(self, url):
560 # Extract id and simplified title from URL
561 mobj = re.match(self._VALID_URL, url)
563 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
566 video_id = mobj.group(1)
568 # Check if video comes from YouTube
569 mobj2 = re.match(r'^yt-(.*)$', video_id)
570 if mobj2 is not None:
# Delegate yt-prefixed ids to the YouTube extractor via the downloader.
571 self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
574 # Retrieve video webpage to extract further information
575 request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
577 self.report_download_webpage(video_id)
578 webpage = compat_urllib_request.urlopen(request).read()
579 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
580 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
583 # Extract URL, uploader and title from webpage
584 self.report_extraction(video_id)
585 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
587 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
# NOTE(review): deriving the extension from the last 3 chars of the URL is
# fragile (query strings, 4-char extensions) — flagged, left as-is.
588 video_extension = mediaURL[-3:]
590 # Extract gdaKey if available
591 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
595 gdaKey = mobj.group(1)
596 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Fallback path: parse the flashvars blob when &mediaURL= is absent.
598 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
600 self._downloader.trouble(u'ERROR: unable to extract media URL')
602 vardict = compat_parse_qs(mobj.group(1))
603 if 'mediaData' not in vardict:
604 self._downloader.trouble(u'ERROR: unable to extract media URL')
606 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
608 self._downloader.trouble(u'ERROR: unable to extract media URL')
610 mediaURL = mobj.group(1).replace('\\/', '/')
611 video_extension = mediaURL[-3:]
612 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
614 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
616 self._downloader.trouble(u'ERROR: unable to extract title')
# NOTE(review): `.decode('utf-8')` on regex groups implies `webpage` is
# bytes here (Python-2-era code) — confirm target interpreter.
618 video_title = mobj.group(1).decode('utf-8')
620 mobj = re.search(r'submitter=(.*?);', webpage)
622 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
624 video_uploader = mobj.group(1)
# Result dict; its opener (`return [{`, orig. ~626) and closer were
# sampled out.
627 'id': video_id.decode('utf-8'),
628 'url': video_url.decode('utf-8'),
629 'uploader': video_uploader.decode('utf-8'),
631 'title': video_title,
632 'ext': video_extension.decode('utf-8'),
# NOTE(review): line-sampled source — leading integers are original line
# numbers; numbering gaps mean lines (try:/return/if-None guards, the loop
# body that picks max_quality) are missing. Reconcile upstream before edits.
636 class DailymotionIE(InfoExtractor):
637 """Information Extractor for Dailymotion"""
639 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
640 IE_NAME = u'dailymotion'
642 def __init__(self, downloader=None):
643 InfoExtractor.__init__(self, downloader)
645 def report_download_webpage(self, video_id):
646 """Report webpage download."""
647 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
649 def report_extraction(self, video_id):
650 """Report information extraction."""
651 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
653 def _real_extract(self, url):
654 # Extract id and simplified title from URL
655 mobj = re.match(self._VALID_URL, url)
657 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Strip title/query suffixes from the id segment ("xyz_title?x" -> "xyz").
660 video_id = mobj.group(1).split('_')[0].split('?')[0]
662 video_extension = 'mp4'
664 # Retrieve video webpage to extract further information
665 request = compat_urllib_request.Request(url)
# Cookie disables Dailymotion's family filter for this request.
666 request.add_header('Cookie', 'family_filter=off')
668 self.report_download_webpage(video_id)
669 webpage_bytes = compat_urllib_request.urlopen(request).read()
670 webpage = webpage_bytes.decode('utf-8')
671 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
672 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
675 # Extract URL, uploader and title from webpage
676 self.report_extraction(video_id)
677 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
679 self._downloader.trouble(u'ERROR: unable to extract media URL')
681 flashvars = compat_urllib_parse.unquote(mobj.group(1))
# Scan qualities best-first; the loop body that sets `max_quality` and
# breaks (orig. 684-685, 687-688) was sampled out.
683 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
686 self._downloader.to_screen(u'[dailymotion] Using %s' % key)
689 self._downloader.trouble(u'ERROR: unable to extract video URL')
692 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
694 self._downloader.trouble(u'ERROR: unable to extract video URL')
697 video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
699 # TODO: support choosing qualities
701 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
703 self._downloader.trouble(u'ERROR: unable to extract title')
705 video_title = unescapeHTML(mobj.group('title'))
707 video_uploader = None
708 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
710 # lookin for official user
711 mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
712 if mobj_official is None:
713 self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
715 video_uploader = mobj_official.group(1)
717 video_uploader = mobj.group(1)
719 video_upload_date = None
720 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
# Rearranges DD-MM-YYYY into the canonical YYYYMMDD string.
722 video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
# Result dict; the `return [{` opener with id/url keys (orig. ~724-726)
# and the closer were sampled out.
727 'uploader': video_uploader,
728 'upload_date': video_upload_date,
729 'title': video_title,
730 'ext': video_extension,
# NOTE(review): line-sampled source — leading integers are original line
# numbers; gaps mean lines (try:/return/if-None guards, `video_url`
# assignment at orig. ~778) are missing. Reconcile upstream before edits.
734 class PhotobucketIE(InfoExtractor):
735 """Information extractor for photobucket.com."""
737 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
738 IE_NAME = u'photobucket'
740 def __init__(self, downloader=None):
741 InfoExtractor.__init__(self, downloader)
743 def report_download_webpage(self, video_id):
744 """Report webpage download."""
745 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
747 def report_extraction(self, video_id):
748 """Report information extraction."""
749 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
751 def _real_extract(self, url):
752 # Extract id from URL
753 mobj = re.match(self._VALID_URL, url)
755 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
758 video_id = mobj.group(1)
760 video_extension = 'flv'
762 # Retrieve video webpage to extract further information
763 request = compat_urllib_request.Request(url)
765 self.report_download_webpage(video_id)
766 webpage = compat_urllib_request.urlopen(request).read()
767 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
768 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
771 # Extract URL, uploader, and title from webpage
772 self.report_extraction(video_id)
773 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
775 self._downloader.trouble(u'ERROR: unable to extract media URL')
777 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
# NOTE(review): the `video_url = mediaURL` assignment (orig. ~778-779)
# is missing from this sample.
781 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
783 self._downloader.trouble(u'ERROR: unable to extract title')
# `.decode('utf-8')` implies bytes webpage (Python-2-era code).
785 video_title = mobj.group(1).decode('utf-8')
787 video_uploader = mobj.group(2).decode('utf-8')
# Result dict; its `return [{` opener and closer were sampled out.
790 'id': video_id.decode('utf-8'),
791 'url': video_url.decode('utf-8'),
792 'uploader': video_uploader,
794 'title': video_title,
795 'ext': video_extension.decode('utf-8'),
# NOTE(review): line-sampled source — leading integers are original line
# numbers; gaps mean lines (try:/return/if-None guards, the result dict
# opener/closer) are missing. Reconcile upstream before edits.
799 class YahooIE(InfoExtractor):
800 """Information extractor for video.yahoo.com."""
802 # _VALID_URL matches all Yahoo! Video URLs
803 # _VPAGE_URL matches only the extractable '/watch/' URLs
804 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
805 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
806 IE_NAME = u'video.yahoo'
808 def __init__(self, downloader=None):
809 InfoExtractor.__init__(self, downloader)
811 def report_download_webpage(self, video_id):
812 """Report webpage download."""
813 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
815 def report_extraction(self, video_id):
816 """Report information extraction."""
817 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
# NOTE: extra `new_video` parameter guards the one-level self-recursion
# used to rewrite non-/watch/ URLs (orig. line 852).
819 def _real_extract(self, url, new_video=True):
820 # Extract ID from URL
821 mobj = re.match(self._VALID_URL, url)
823 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
826 video_id = mobj.group(2)
827 video_extension = 'flv'
829 # Rewrite valid but non-extractable URLs as
830 # extractable English language /watch/ URLs
831 if re.match(self._VPAGE_URL, url) is None:
832 request = compat_urllib_request.Request(url)
834 webpage = compat_urllib_request.urlopen(request).read()
835 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
836 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
839 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
841 self._downloader.trouble(u'ERROR: Unable to extract id field')
843 yahoo_id = mobj.group(1)
845 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
847 self._downloader.trouble(u'ERROR: Unable to extract vid field')
849 yahoo_vid = mobj.group(1)
# Re-enter with the canonical /watch/ URL; new_video=False stops a loop.
851 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
852 return self._real_extract(url, new_video=False)
854 # Retrieve video webpage to extract further information
855 request = compat_urllib_request.Request(url)
857 self.report_download_webpage(video_id)
858 webpage = compat_urllib_request.urlopen(request).read()
859 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
860 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
863 # Extract uploader and title from webpage
864 self.report_extraction(video_id)
865 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
867 self._downloader.trouble(u'ERROR: unable to extract video title')
869 video_title = mobj.group(1).decode('utf-8')
871 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
873 self._downloader.trouble(u'ERROR: unable to extract video uploader')
# NOTE(review): group(1) here is the "(people|profile)" alternation, not
# the uploader name in group(2) — looks like an off-by-one group index in
# the original; flagged only (doc-only review).
875 video_uploader = mobj.group(1).decode('utf-8')
877 # Extract video thumbnail
878 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
880 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
882 video_thumbnail = mobj.group(1).decode('utf-8')
884 # Extract video description
885 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
887 self._downloader.trouble(u'ERROR: unable to extract video description')
889 video_description = mobj.group(1).decode('utf-8')
890 if not video_description:
891 video_description = 'No description available.'
893 # Extract video height and width
894 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
896 self._downloader.trouble(u'ERROR: unable to extract video height')
898 yv_video_height = mobj.group(1)
900 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
902 self._downloader.trouble(u'ERROR: unable to extract video width')
904 yv_video_width = mobj.group(1)
906 # Retrieve video playlist to extract media URL
907 # I'm not completely sure what all these options are, but we
908 # seem to need most of them, otherwise the server sends a 401.
909 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
910 yv_bitrate = '700' # according to Wikipedia this is hard-coded
911 request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
912 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
913 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
915 self.report_download_webpage(video_id)
916 webpage = compat_urllib_request.urlopen(request).read()
917 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
918 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
921 # Extract media URL from playlist XML
922 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
924 self._downloader.trouble(u'ERROR: Unable to extract media URL')
926 video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
927 video_url = unescapeHTML(video_url)
# Result dict; its `return [{` opener and closer were sampled out.
930 'id': video_id.decode('utf-8'),
932 'uploader': video_uploader,
934 'title': video_title,
935 'ext': video_extension.decode('utf-8'),
936 'thumbnail': video_thumbnail.decode('utf-8'),
937 'description': video_description,
941 class VimeoIE(InfoExtractor):
942 """Information extractor for vimeo.com."""
# NOTE(review): this chunk is elided (gaps in the embedded line numbers);
# guard lines such as "if mobj is None:" / "try:" / "return" are not shown.
944 # _VALID_URL matches Vimeo URLs
945 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
948 def __init__(self, downloader=None):
949 InfoExtractor.__init__(self, downloader)
951 def report_download_webpage(self, video_id):
952 """Report webpage download."""
953 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
955 def report_extraction(self, video_id):
956 """Report information extraction."""
957 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
959 def _real_extract(self, url, new_video=True):
# Extracts the numeric clip id from group(1) of _VALID_URL, fetches the
# watch page, and parses the JSON "config" object embedded in the page.
960 # Extract ID from URL
961 mobj = re.match(self._VALID_URL, url)
963 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
966 video_id = mobj.group(1)
968 # Retrieve video webpage to extract further information
969 request = compat_urllib_request.Request(url, None, std_headers)
971 self.report_download_webpage(video_id)
972 webpage_bytes = compat_urllib_request.urlopen(request).read()
973 webpage = webpage_bytes.decode('utf-8')
974 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
975 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
978 # Now we begin extracting as much information as we can from what we
979 # retrieved. First we extract the information common to all extractors,
980 # and latter we extract those that are Vimeo specific.
981 self.report_extraction(video_id)
983 # Extract the config JSON
# NOTE(review): string-splitting on the literal markers ' = {config:' and
# ',assets:' is brittle — any page-markup change breaks it; confirm against
# a current page before relying on this.
985 config = webpage.split(' = {config:')[1].split(',assets:')[0]
986 config = json.loads(config)
988 self._downloader.trouble(u'ERROR: unable to extract info section')
992 video_title = config["video"]["title"]
995 video_uploader = config["video"]["owner"]["name"]
997 # Extract video thumbnail
998 video_thumbnail = config["video"]["thumbnail"]
1000 # Extract video description
1001 video_description = get_element_by_id("description", webpage)
1002 if video_description: video_description = clean_html(video_description)
1003 else: video_description = ''
1005 # Extract upload date
1006 video_upload_date = None
1007 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1008 if mobj is not None:
1009 video_upload_date = mobj.group(1)
1011 # Vimeo specific: extract request signature and timestamp
1012 sig = config['request']['signature']
1013 timestamp = config['request']['timestamp']
1015 # Vimeo specific: extract video codec and quality information
1016 # First consider quality, then codecs, then take everything
1017 # TODO bind to format param
1018 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
# Buckets are searched in preference order hd > sd > other (see loop below);
# within a bucket, codec preference follows the order of `codecs`.
1019 files = { 'hd': [], 'sd': [], 'other': []}
1020 for codec_name, codec_extension in codecs:
1021 if codec_name in config["video"]["files"]:
1022 if 'hd' in config["video"]["files"][codec_name]:
1023 files['hd'].append((codec_name, codec_extension, 'hd'))
1024 elif 'sd' in config["video"]["files"][codec_name]:
1025 files['sd'].append((codec_name, codec_extension, 'sd'))
1027 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
1029 for quality in ('hd', 'sd', 'other'):
1030 if len(files[quality]) > 0:
# First non-empty bucket wins; tuple layout is (codec, extension, quality).
1031 video_quality = files[quality][0][2]
1032 video_codec = files[quality][0][0]
1033 video_extension = files[quality][0][1]
1034 self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1037 self._downloader.trouble(u'ERROR: no known codec found')
# The final media URL is the play_redirect endpoint signed with sig/timestamp
# taken from config['request'] above.
1040 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1041 %(video_id, sig, timestamp, video_quality, video_codec.upper())
1046 'uploader': video_uploader,
1047 'upload_date': video_upload_date,
1048 'title': video_title,
1049 'ext': video_extension,
1050 'thumbnail': video_thumbnail,
1051 'description': video_description,
1055 class ArteTvIE(InfoExtractor):
1056 """arte.tv information extractor."""
# NOTE(review): chunk is elided (gaps in embedded line numbers); "try:",
# "if ... is None:", "return" and multiline-call lines are not shown.
1058 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
# _LIVE_URL matches the trailing path component of live-stream index pages.
1059 _LIVE_URL = r'index-[0-9]+\.html$'
1061 IE_NAME = u'arte.tv'
1063 def __init__(self, downloader=None):
1064 InfoExtractor.__init__(self, downloader)
1066 def report_download_webpage(self, video_id):
1067 """Report webpage download."""
1068 self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)
1070 def report_extraction(self, video_id):
1071 """Report information extraction."""
1072 self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)
1074 def fetch_webpage(self, url):
# Downloads url and returns the raw page body; network errors are routed
# through self._downloader.trouble().
1075 self._downloader.increment_downloads()
1076 request = compat_urllib_request.Request(url)
1078 self.report_download_webpage(url)
1079 webpage = compat_urllib_request.urlopen(request).read()
1080 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1081 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1083 except ValueError as err:
1084 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1088 def grep_webpage(self, url, regex, regexFlags, matchTuples):
# Fetches url, applies `regex` with `regexFlags`, and builds a dict mapping
# each matchTuples (group_index, key, error_message) entry to its group text.
1089 page = self.fetch_webpage(url)
1090 mobj = re.search(regex, page, regexFlags)
1094 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1097 for (i, key, err) in matchTuples:
1098 if mobj.group(i) is None:
# Per-group error message comes from the caller via matchTuples.
1099 self._downloader.trouble(err)
1102 info[key] = mobj.group(i)
1106 def extractLiveStream(self, url):
# Live path: language code taken from the URL path (4th segment from the
# end), then two grep_webpage hops: videothek JS -> swf player/stream path.
1107 video_lang = url.split('/')[-4]
1108 info = self.grep_webpage(
1110 r'src="(.*?/videothek_js.*?\.js)',
1113 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1116 http_host = url.split('/')[2]
1117 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1118 info = self.grep_webpage(
1120 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1121 '(http://.*?\.swf).*?' +
1125 (1, 'path', u'ERROR: could not extract video path: %s' % url),
1126 (2, 'player', u'ERROR: could not extract video player: %s' % url),
1127 (3, 'url', u'ERROR: could not extract video url: %s' % url)
1130 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1132 def extractPlus7Stream(self, url):
# "+7" catch-up path: language is the 3rd path segment from the end; follows
# three grep_webpage hops (movie param -> <video lang> ref -> hd <url>).
1133 video_lang = url.split('/')[-3]
1134 info = self.grep_webpage(
1136 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1139 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1142 next_url = compat_urllib_parse.unquote(info.get('url'))
1143 info = self.grep_webpage(
1145 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1148 (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
1151 next_url = compat_urllib_parse.unquote(info.get('url'))
1153 info = self.grep_webpage(
1155 r'<video id="(.*?)".*?>.*?' +
1156 '<name>(.*?)</name>.*?' +
1157 '<dateVideo>(.*?)</dateVideo>.*?' +
1158 '<url quality="hd">(.*?)</url>',
1161 (1, 'id', u'ERROR: could not extract video id: %s' % url),
1162 (2, 'title', u'ERROR: could not extract video title: %s' % url),
1163 (3, 'date', u'ERROR: could not extract video date: %s' % url),
1164 (4, 'url', u'ERROR: could not extract video url: %s' % url)
1169 'id': info.get('id'),
1170 'url': compat_urllib_parse.unquote(info.get('url')),
1171 'uploader': u'arte.tv',
1172 'upload_date': info.get('date'),
1173 'title': info.get('title'),
1179 def _real_extract(self, url):
1180 video_id = url.split('/')[-1]
1181 self.report_extraction(video_id)
1183 if re.search(self._LIVE_URL, video_id) is not None:
# NOTE(review): as shown, the live branch discards extractLiveStream()'s
# result while the +7 branch binds it — confirm against the elided lines
# whether the live result is intentionally unused.
1184 self.extractLiveStream(url)
1187 info = self.extractPlus7Stream(url)
1192 class GenericIE(InfoExtractor):
1193 """Generic last-resort information extractor."""
# NOTE(review): chunk is elided (gaps in embedded line numbers); "try:",
# "if mobj is None:", "return" and several handler lines are not shown.
1196 IE_NAME = u'generic'
1198 def __init__(self, downloader=None):
1199 InfoExtractor.__init__(self, downloader)
1201 def report_download_webpage(self, video_id):
1202 """Report webpage download."""
1203 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1204 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1206 def report_extraction(self, video_id):
1207 """Report information extraction."""
1208 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1210 def report_following_redirect(self, new_url):
1211 """Report information extraction."""
1212 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1214 def _test_redirect(self, url):
1215 """Check if it is a redirect, like url shorteners, in case restart chain."""
# Issues a HEAD request so redirects can be detected without downloading the
# body; the two handler classes keep the request a HEAD across redirects and
# fall back to GET where HEAD is rejected.
1216 class HeadRequest(compat_urllib_request.Request):
1217 def get_method(self):
1220 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1222 Subclass the HTTPRedirectHandler to make it use our
1223 HeadRequest also on the redirected URL
1225 def redirect_request(self, req, fp, code, msg, headers, newurl):
1226 if code in (301, 302, 303, 307):
# Spaces in Location headers are not legal in URLs; percent-encode them.
1227 newurl = newurl.replace(' ', '%20')
1228 newheaders = dict((k,v) for k,v in req.headers.items()
1229 if k.lower() not in ("content-length", "content-type"))
1230 return HeadRequest(newurl,
1232 origin_req_host=req.get_origin_req_host(),
1235 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1237 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1239 Fallback to GET if HEAD is not allowed (405 HTTP error)
1241 def http_error_405(self, req, fp, code, msg, headers):
1245 newheaders = dict((k,v) for k,v in req.headers.items()
1246 if k.lower() not in ("content-length", "content-type"))
1247 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1249 origin_req_host=req.get_origin_req_host(),
# Build a bare OpenerDirector with only the handlers we need; ordering of
# handler registration is as listed below.
1253 opener = compat_urllib_request.OpenerDirector()
1254 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1255 HTTPMethodFallback, HEADRedirectHandler,
1256 compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1257 opener.add_handler(handler())
1259 response = opener.open(HeadRequest(url))
1260 new_url = response.geturl()
1265 self.report_following_redirect(new_url)
# Restart the extraction chain on the redirect target.
1266 self._downloader.download([new_url])
1269 def _real_extract(self, url):
1270 if self._test_redirect(url): return
1272 video_id = url.split('/')[-1]
1273 request = compat_urllib_request.Request(url)
1275 self.report_download_webpage(video_id)
1276 webpage = compat_urllib_request.urlopen(request).read()
1277 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1278 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1280 except ValueError as err:
1281 # since this is the last-resort InfoExtractor, if
1282 # this error is thrown, it'll be thrown here
1283 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1286 self.report_extraction(video_id)
1287 # Start with something easy: JW Player in SWFObject
1288 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1290 # Broaden the search a little bit
1291 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1293 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1296 # It's possible that one of the regexes
1297 # matched, but returned an empty group:
1298 if mobj.group(1) is None:
1299 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1302 video_url = compat_urllib_parse.unquote(mobj.group(1))
1303 video_id = os.path.basename(video_url)
1305 # here's a fun little line of code for you:
# The extension is taken from the basename, then stripped off the id.
1306 video_extension = os.path.splitext(video_id)[1][1:]
1307 video_id = os.path.splitext(video_id)[0]
1309 # it's tempting to parse this further, but you would
1310 # have to take into account all the variations like
1311 # Video Title - Site Name
1312 # Site Name | Video Title
1313 # Video Title - Tagline | Site Name
1314 # and so on and so forth; it's just not practical
1315 mobj = re.search(r'<title>(.*)</title>', webpage)
1317 self._downloader.trouble(u'ERROR: unable to extract title')
1319 video_title = mobj.group(1)
1321 # video uploader is domain name
1322 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
# NOTE(review): this error message says "title" but the failure here is the
# uploader/domain extraction — likely a copy-paste slip; confirm upstream.
1324 self._downloader.trouble(u'ERROR: unable to extract title')
1326 video_uploader = mobj.group(1)
1331 'uploader': video_uploader,
1332 'upload_date': None,
1333 'title': video_title,
1334 'ext': video_extension,
1338 class YoutubeSearchIE(InfoExtractor):
1339 """Information Extractor for YouTube search queries."""
# NOTE(review): chunk is elided (gaps in embedded line numbers); the
# "if ... is None:", "try:", "return" and loop-setup lines are not shown.
# Matches "ytsearch:", "ytsearchN:" and "ytsearchall:" pseudo-URLs.
1340 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1341 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1342 _max_youtube_results = 1000
1343 IE_NAME = u'youtube:search'
1345 def __init__(self, downloader=None):
1346 InfoExtractor.__init__(self, downloader)
1348 def report_download_page(self, query, pagenum):
1349 """Report attempt to download search page with given number."""
1350 query = query.decode(preferredencoding())
1351 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1353 def _real_extract(self, query):
# Parses the prefix (none -> 1 result, "all" -> cap, integer -> that many,
# unparsable integer -> 1) and delegates to _download_n_results.
1354 mobj = re.match(self._VALID_URL, query)
1356 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1359 prefix, query = query.split(':')
1361 query = query.encode('utf-8')
1363 self._download_n_results(query, 1)
1365 elif prefix == 'all':
1366 self._download_n_results(query, self._max_youtube_results)
1372 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1374 elif n > self._max_youtube_results:
1375 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1376 n = self._max_youtube_results
1377 self._download_n_results(query, n)
1379 except ValueError: # parsing prefix as integer fails
1380 self._download_n_results(query, 1)
1383 def _download_n_results(self, query, n):
1384 """Downloads a specified number of results for a query"""
# Pages through the GData JSON-C API 50 results at a time; the loop bound
# `limit` is tightened to the API-reported totalItems each iteration.
1390 while (50 * pagenum) < limit:
1391 self.report_download_page(query, pagenum+1)
1392 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1393 request = compat_urllib_request.Request(result_url)
1395 data = compat_urllib_request.urlopen(request).read()
1396 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1397 self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
1399 api_response = json.loads(data)['data']
1401 new_ids = list(video['id'] for video in api_response['items'])
1402 video_ids += new_ids
1404 limit = min(n, api_response['totalItems'])
1407 if len(video_ids) > n:
1408 video_ids = video_ids[:n]
1409 for id in video_ids:
1410 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1414 class GoogleSearchIE(InfoExtractor):
1415 """Information Extractor for Google Video search queries."""
# NOTE(review): chunk is elided; structure mirrors YoutubeSearchIE
# (same prefix parsing), but results are scraped from HTML pages rather
# than a JSON API.
1416 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1417 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1418 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
# Presence of this marker in the page means another results page exists.
1419 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1420 _max_google_results = 1000
1421 IE_NAME = u'video.google:search'
1423 def __init__(self, downloader=None):
1424 InfoExtractor.__init__(self, downloader)
1426 def report_download_page(self, query, pagenum):
1427 """Report attempt to download playlist page with given number."""
1428 query = query.decode(preferredencoding())
1429 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1431 def _real_extract(self, query):
# Prefix semantics: none -> 1, "all" -> cap, integer -> clamped to cap,
# unparsable -> 1 (see the ValueError handler below).
1432 mobj = re.match(self._VALID_URL, query)
1434 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1437 prefix, query = query.split(':')
1439 query = query.encode('utf-8')
1441 self._download_n_results(query, 1)
1443 elif prefix == 'all':
1444 self._download_n_results(query, self._max_google_results)
1450 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1452 elif n > self._max_google_results:
1453 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1454 n = self._max_google_results
1455 self._download_n_results(query, n)
1457 except ValueError: # parsing prefix as integer fails
1458 self._download_n_results(query, 1)
1461 def _download_n_results(self, query, n):
1462 """Downloads a specified number of results for a query"""
# Scrapes docids from each results page (10 results per page, offset
# pagenum*10); dispatches downloads once n ids are collected or no
# further pages remain.
1468 self.report_download_page(query, pagenum)
1469 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
1470 request = compat_urllib_request.Request(result_url)
1472 page = compat_urllib_request.urlopen(request).read()
1473 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1474 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1477 # Extract video identifiers
1478 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1479 video_id = mobj.group(1)
1480 if video_id not in video_ids:
1481 video_ids.append(video_id)
1482 if len(video_ids) == n:
1483 # Specified n videos reached
1484 for id in video_ids:
1485 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1488 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
# No next-page marker: flush whatever was collected and stop.
1489 for id in video_ids:
1490 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1493 pagenum = pagenum + 1
1496 class YahooSearchIE(InfoExtractor):
1497 """Information Extractor for Yahoo! Video search queries."""
# NOTE(review): chunk is elided; structure mirrors GoogleSearchIE, with an
# explicit `already_seen` set for de-duplication instead of a list scan.
1498 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1499 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1500 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1501 _MORE_PAGES_INDICATOR = r'\s*Next'
1502 _max_yahoo_results = 1000
1503 IE_NAME = u'video.yahoo:search'
1505 def __init__(self, downloader=None):
1506 InfoExtractor.__init__(self, downloader)
1508 def report_download_page(self, query, pagenum):
1509 """Report attempt to download playlist page with given number."""
1510 query = query.decode(preferredencoding())
1511 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1513 def _real_extract(self, query):
# Prefix semantics: none -> 1, "all" -> cap, integer -> clamped to cap,
# unparsable -> 1 (ValueError handler below).
1514 mobj = re.match(self._VALID_URL, query)
1516 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1519 prefix, query = query.split(':')
1521 query = query.encode('utf-8')
1523 self._download_n_results(query, 1)
1525 elif prefix == 'all':
1526 self._download_n_results(query, self._max_yahoo_results)
1532 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1534 elif n > self._max_yahoo_results:
1535 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1536 n = self._max_yahoo_results
1537 self._download_n_results(query, n)
1539 except ValueError: # parsing prefix as integer fails
1540 self._download_n_results(query, 1)
1543 def _download_n_results(self, query, n):
1544 """Downloads a specified number of results for a query"""
1547 already_seen = set()
1551 self.report_download_page(query, pagenum)
1552 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
1553 request = compat_urllib_request.Request(result_url)
1555 page = compat_urllib_request.urlopen(request).read()
1556 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1557 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1560 # Extract video identifiers
1561 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1562 video_id = mobj.group(1)
1563 if video_id not in already_seen:
1564 video_ids.append(video_id)
1565 already_seen.add(video_id)
1566 if len(video_ids) == n:
1567 # Specified n videos reached
1568 for id in video_ids:
1569 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1572 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
# No next-page marker: flush collected ids and stop.
1573 for id in video_ids:
1574 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1577 pagenum = pagenum + 1
1580 class YoutubePlaylistIE(InfoExtractor):
1581 """Information Extractor for YouTube playlists."""
# NOTE(review): chunk is elided (gaps in embedded line numbers); loop-setup,
# "try:", "if ... is None:", "break"/"return" lines are not shown.
1583 _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
1584 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1585 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s'
1586 _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
1587 IE_NAME = u'youtube:playlist'
1589 def __init__(self, downloader=None):
1590 InfoExtractor.__init__(self, downloader)
1592 def report_download_page(self, playlist_id, pagenum):
1593 """Report attempt to download playlist page with given number."""
1594 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1596 def _real_extract(self, url):
1597 # Extract playlist id
1598 mobj = re.match(self._VALID_URL, url)
1600 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# Group 3, when present, is a single video id embedded in the playlist URL;
# hand it straight back to the downloader.
1604 if mobj.group(3) is not None:
1605 self._downloader.download([mobj.group(3)])
1608 # Download playlist pages
1609 # prefix is 'p' as default for playlists but there are other types that need extra care
1610 playlist_prefix = mobj.group(1)
1611 if playlist_prefix == 'a':
1612 playlist_access = 'artist'
1614 playlist_prefix = 'p'
1615 playlist_access = 'view_play_list'
1616 playlist_id = mobj.group(2)
1621 self.report_download_page(playlist_id, pagenum)
1622 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1623 request = compat_urllib_request.Request(url)
1625 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1626 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1627 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1630 # Extract video identifiers
1632 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1633 if mobj.group(1) not in ids_in_page:
1634 ids_in_page.append(mobj.group(1))
1635 video_ids.extend(ids_in_page)
1637 if self._MORE_PAGES_INDICATOR not in page:
1639 pagenum = pagenum + 1
1641 total = len(video_ids)
# Apply the user's --playlist-start/--playlist-end window; playliststart is
# 1-based in params, hence the -1.
1643 playliststart = self._downloader.params.get('playliststart', 1) - 1
1644 playlistend = self._downloader.params.get('playlistend', -1)
1645 if playlistend == -1:
1646 video_ids = video_ids[playliststart:]
1648 video_ids = video_ids[playliststart:playlistend]
1650 if len(video_ids) == total:
1651 self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
1653 self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))
1655 for id in video_ids:
1656 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1660 class YoutubeChannelIE(InfoExtractor):
1661 """Information Extractor for YouTube channels."""
# NOTE(review): chunk is elided; loop setup, "try:", "if ... is None:" and
# "break" lines are not shown. Note this class (unlike its siblings) defines
# no __init__ and relies on InfoExtractor's.
1663 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1664 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1665 _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
1666 IE_NAME = u'youtube:channel'
1668 def report_download_page(self, channel_id, pagenum):
1669 """Report attempt to download channel page with given number."""
1670 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1672 def _real_extract(self, url):
1673 # Extract channel id
1674 mobj = re.match(self._VALID_URL, url)
1676 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1679 # Download channel pages
1680 channel_id = mobj.group(1)
1685 self.report_download_page(channel_id, pagenum)
1686 url = self._TEMPLATE_URL % (channel_id, pagenum)
1687 request = compat_urllib_request.Request(url)
1689 page = compat_urllib_request.urlopen(request).read().decode('utf8')
1690 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1691 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1694 # Extract video identifiers
1696 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1697 if mobj.group(1) not in ids_in_page:
1698 ids_in_page.append(mobj.group(1))
1699 video_ids.extend(ids_in_page)
# Stop when the "Next »" marker disappears from the page.
1701 if self._MORE_PAGES_INDICATOR not in page:
1703 pagenum = pagenum + 1
1705 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1707 for id in video_ids:
1708 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1712 class YoutubeUserIE(InfoExtractor):
1713 """Information Extractor for YouTube users."""
# NOTE(review): chunk is elided; loop setup, "try:", "if ... is None:",
# "return"/"break" lines are not shown.
1715 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1716 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
# GData caps each uploads-feed response at 50 entries; paging is driven by
# 1-based start-index below.
1717 _GDATA_PAGE_SIZE = 50
1718 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1719 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1720 IE_NAME = u'youtube:user'
1722 def __init__(self, downloader=None):
1723 InfoExtractor.__init__(self, downloader)
1725 def report_download_page(self, username, start_index):
1726 """Report attempt to download user page."""
1727 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1728 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1730 def _real_extract(self, url):
1732 mobj = re.match(self._VALID_URL, url)
1734 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1737 username = mobj.group(1)
1739 # Download video ids using YouTube Data API. Result size per
1740 # query is limited (currently to 50 videos) so we need to query
1741 # page by page until there are no video ids - it means we got
1748 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1749 self.report_download_page(username, start_index)
1751 request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1754 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1755 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1756 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1759 # Extract video identifiers
1762 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1763 if mobj.group(1) not in ids_in_page:
1764 ids_in_page.append(mobj.group(1))
1766 video_ids.extend(ids_in_page)
1768 # A little optimization - if current page is not
1769 # "full", ie. does not contain PAGE_SIZE video ids then
1770 # we can assume that this page is the last one - there
1771 # are no more ids on further pages - no need to query
1774 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1779 all_ids_count = len(video_ids)
# Apply --playlist-start/--playlist-end windowing (playliststart is 1-based).
1780 playliststart = self._downloader.params.get('playliststart', 1) - 1
1781 playlistend = self._downloader.params.get('playlistend', -1)
1783 if playlistend == -1:
1784 video_ids = video_ids[playliststart:]
1786 video_ids = video_ids[playliststart:playlistend]
1788 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1789 (username, all_ids_count, len(video_ids)))
1791 for video_id in video_ids:
1792 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1795 class BlipTVUserIE(InfoExtractor):
1796 """Information Extractor for blip.tv users."""
# NOTE(review): chunk is elided; loop setup, "try:", "if ... is None:" and
# "break" lines are not shown.
1798 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1800 IE_NAME = u'blip.tv:user'
1802 def __init__(self, downloader=None):
1803 InfoExtractor.__init__(self, downloader)
1805 def report_download_page(self, username, pagenum):
1806 """Report attempt to download user page."""
1807 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1808 (self.IE_NAME, username, pagenum))
1810 def _real_extract(self, url):
1812 mobj = re.match(self._VALID_URL, url)
1814 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1817 username = mobj.group(1)
# The numeric users_id needed by the mobile AJAX endpoint is scraped from
# the user's public page (data-users-id attribute) below.
1819 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1821 request = compat_urllib_request.Request(url)
1824 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1825 mobj = re.search(r'data-users-id="([^"]+)"', page)
1826 page_base = page_base % mobj.group(1)
1827 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1828 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1832 # Download video ids using BlipTV Ajax calls. Result size per
1833 # query is limited (currently to 12 videos) so we need to query
1834 # page by page until there are no video ids - it means we got
1841 self.report_download_page(username, pagenum)
1843 request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )
1846 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1847 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
# NOTE(review): uses str(err) here while sibling extractors use
# compat_str(err) — confirm whether this inconsistency matters on py2.
1848 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1851 # Extract video identifiers
1854 for mobj in re.finditer(r'href="/([^"]+)"', page):
1855 if mobj.group(1) not in ids_in_page:
1856 ids_in_page.append(unescapeHTML(mobj.group(1)))
1858 video_ids.extend(ids_in_page)
1860 # A little optimization - if current page is not
1861 # "full", ie. does not contain PAGE_SIZE video ids then
1862 # we can assume that this page is the last one - there
1863 # are no more ids on further pages - no need to query
# NOTE(review): self._PAGE_SIZE is referenced but not defined in the visible
# part of this class — presumably a class attribute (comment above says 12)
# on one of the elided lines; verify.
1866 if len(ids_in_page) < self._PAGE_SIZE:
1871 all_ids_count = len(video_ids)
1872 playliststart = self._downloader.params.get('playliststart', 1) - 1
1873 playlistend = self._downloader.params.get('playlistend', -1)
1875 if playlistend == -1:
1876 video_ids = video_ids[playliststart:]
1878 video_ids = video_ids[playliststart:playlistend]
1880 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1881 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1883 for video_id in video_ids:
1884 self._downloader.download([u'http://blip.tv/'+video_id])
1887 class DepositFilesIE(InfoExtractor):
1888 """Information extractor for depositfiles.com"""
# NOTE(review): chunk is elided; "try:", "if mobj is None:", "return" and
# the result-dict opening lines are not shown.
1890 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1891 IE_NAME = u'DepositFiles'
1893 def __init__(self, downloader=None):
1894 InfoExtractor.__init__(self, downloader)
1896 def report_download_webpage(self, file_id):
1897 """Report webpage download."""
1898 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1900 def report_extraction(self, file_id):
1901 """Report information extraction."""
1902 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1904 def _real_extract(self, url):
1905 file_id = url.split('/')[-1]
1906 # Rebuild url in english locale
1907 url = 'http://depositfiles.com/en/files/' + file_id
1909 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 simulates pressing the "Free download" button.
1910 free_download_indication = { 'gateway_result' : '1' }
1911 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
1913 self.report_download_webpage(file_id)
1914 webpage = compat_urllib_request.urlopen(request).read()
1915 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1916 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
1919 # Search for the real file URL
1920 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1921 if (mobj is None) or (mobj.group(1) is None):
1922 # Try to figure out reason of the error.
1923 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1924 if (mobj is not None) and (mobj.group(1) is not None):
# Surface the site's own restriction notice (e.g. download limits) to the
# user instead of a generic failure.
1925 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1926 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1928 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1931 file_url = mobj.group(1)
1932 file_extension = os.path.splitext(file_url)[1][1:]
1934 # Search for file title
1935 mobj = re.search(r'<b title="(.*?)">', webpage)
1937 self._downloader.trouble(u'ERROR: unable to extract title')
# NOTE(review): .decode('utf-8') on regex groups (here and in the result
# dict below) assumes py2 byte strings — would raise AttributeError on py3
# str; confirm the intended interpreter for this file.
1939 file_title = mobj.group(1).decode('utf-8')
1942 'id': file_id.decode('utf-8'),
1943 'url': file_url.decode('utf-8'),
1945 'upload_date': None,
1946 'title': file_title,
1947 'ext': file_extension.decode('utf-8'),
# NOTE(review): numbered listing with elided lines (e.g. 1960-1963 the
# _video_extensions entries, the 'try:'/'return'/'else:' lines). Code is kept
# byte-identical; only review comments are added.
1951 class FacebookIE(InfoExtractor):
1952 """Information Extractor for Facebook"""
# Named group ID captures the numeric video id from video.php/photo.php URLs.
1955 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1956 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1957 _NETRC_MACHINE = 'facebook'
# Ordered best-first; used both for format listing and for the format_limit slice below.
1958 _available_formats = ['video', 'highqual', 'lowqual']
1959 _video_extensions = {
1964 IE_NAME = u'facebook'
1966 def __init__(self, downloader=None):
1967 InfoExtractor.__init__(self, downloader)
1969 def _reporter(self, message):
1970 """Add header and report message."""
1971 self._downloader.to_screen(u'[facebook] %s' % message)
1973 def report_login(self):
1974 """Report attempt to log in."""
1975 self._reporter(u'Logging in')
1977 def report_video_webpage_download(self, video_id):
1978 """Report attempt to download video webpage."""
1979 self._reporter(u'%s: Downloading video webpage' % video_id)
1981 def report_information_extraction(self, video_id):
1982 """Report attempt to extract video information."""
1983 self._reporter(u'%s: Extracting video information' % video_id)
# Scrapes title/description/owner/thumbnail plus per-format URLs out of the
# raw page with the regexes below; returns the assembled video_info dict
# (its initialisation line is elided in this listing).
1985 def _parse_page(self, video_webpage):
1986 """Extract video information from page"""
1988 data = {'title': r'\("video_title", "(.*?)"\)',
1989 'description': r'<div class="datawrap">(.*?)</div>',
1990 'owner': r'\("video_owner_name", "(.*?)"\)',
1991 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
1994 for piece in data.keys():
1995 mobj = re.search(data[piece], video_webpage)
1996 if mobj is not None:
# NOTE(review): .decode("unicode_escape") assumes byte-string groups (Python 2
# semantics) — confirm target runtime.
1997 video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
1993 for fmt in self._available_formats:
2002 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2003 if mobj is not None:
2004 # URL is in a Javascript segment inside an escaped Unicode format within
2005 # the generally utf-8 page
2006 video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
2007 video_info['video_urls'] = video_urls
# Logs in with --username/--password or the 'facebook' .netrc entry; failures
# are warnings only (extraction proceeds unauthenticated).
2011 def _real_initialize(self):
2012 if self._downloader is None:
2017 downloader_params = self._downloader.params
2019 # Attempt to use provided username and password or .netrc data
2020 if downloader_params.get('username', None) is not None:
2021 useremail = downloader_params['username']
2022 password = downloader_params['password']
2023 elif downloader_params.get('usenetrc', False):
2025 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2026 if info is not None:
2030 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2031 except (IOError, netrc.NetrcParseError) as err:
2032 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
2035 if useremail is None:
# login_form construction (lines 2036-2043) is elided in this listing.
2044 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2047 login_results = compat_urllib_request.urlopen(request).read()
# A login form still present in the response means the login was rejected.
2048 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2049 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2051 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2052 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
# Downloads the video page, parses it via _parse_page, then applies the same
# best/worst/-1/explicit format-selection scheme used by the other extractors.
2055 def _real_extract(self, url):
2056 mobj = re.match(self._VALID_URL, url)
2058 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2060 video_id = mobj.group('ID')
2063 self.report_video_webpage_download(video_id)
2064 request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2066 page = compat_urllib_request.urlopen(request)
2067 video_webpage = page.read()
2068 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2069 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2072 # Start extracting information
2073 self.report_information_extraction(video_id)
2075 # Extract information
2076 video_info = self._parse_page(video_webpage)
2079 if 'owner' not in video_info:
2080 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2082 video_uploader = video_info['owner']
2085 if 'title' not in video_info:
2086 self._downloader.trouble(u'ERROR: unable to extract video title')
2088 video_title = video_info['title']
2089 video_title = video_title.decode('utf-8')
2092 if 'thumbnail' not in video_info:
# Missing thumbnail is non-fatal: warn and fall back to an empty string.
2093 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2094 video_thumbnail = ''
2096 video_thumbnail = video_info['thumbnail']
2100 if 'upload_date' in video_info:
2101 upload_time = video_info['upload_date']
# parsedate_tz handles RFC-2822 style dates; reformatted to YYYYMMDD.
2102 timetuple = email.utils.parsedate_tz(upload_time)
2103 if timetuple is not None:
2105 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2110 video_description = video_info.get('description', 'No description available.')
2112 url_map = video_info['video_urls']
2113 if len(url_map.keys()) > 0:
2114 # Decide which formats to download
2115 req_format = self._downloader.params.get('format', None)
2116 format_limit = self._downloader.params.get('format_limit', None)
# format_limit caps quality: keep only formats at or below the limit.
2118 if format_limit is not None and format_limit in self._available_formats:
2119 format_list = self._available_formats[self._available_formats.index(format_limit):]
2121 format_list = self._available_formats
2122 existing_formats = [x for x in format_list if x in url_map]
2123 if len(existing_formats) == 0:
2124 self._downloader.trouble(u'ERROR: no known formats available for video')
2126 if req_format is None:
2127 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2128 elif req_format == 'worst':
2129 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2130 elif req_format == '-1':
2131 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2134 if req_format not in url_map:
2135 self._downloader.trouble(u'ERROR: requested format not available')
2137 video_url_list = [(req_format, url_map[req_format])] # Specific format
2140 for format_param, video_real_url in video_url_list:
2142 video_extension = self._video_extensions.get(format_param, 'mp4')
# One info dict per selected format (the dict opener line is elided here).
2145 'id': video_id.decode('utf-8'),
2146 'url': video_real_url.decode('utf-8'),
2147 'uploader': video_uploader.decode('utf-8'),
2148 'upload_date': upload_date,
2149 'title': video_title,
2150 'ext': video_extension.decode('utf-8'),
# 'and/or' idiom predates conditional expressions; yields u'NA' when format_param is falsy.
2151 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2152 'thumbnail': video_thumbnail.decode('utf-8'),
2153 'description': video_description.decode('utf-8'),
# NOTE(review): numbered listing with elided lines (the '?'/'&' cchar selection,
# 'try:' lines, the direct-download info dict). Code kept byte-identical;
# only review comments are added.
2157 class BlipTVIE(InfoExtractor):
2158 """Information extractor for blip.tv"""
2160 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# Used below to pull the filename extension off the media URL.
2161 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2162 IE_NAME = u'blip.tv'
2164 def report_extraction(self, file_id):
2165 """Report information extraction."""
2166 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2168 def report_direct_download(self, title):
2169 """Report information extraction."""
2170 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
# Requests the page with skin=json; if the server answers with the media
# itself (Content-Type video/*) it is a direct download, otherwise the JSON
# 'Post' payload is parsed for the metadata.
2172 def _real_extract(self, url):
2173 mobj = re.match(self._VALID_URL, url)
2175 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# cchar ('?' or '&') is chosen on an elided line depending on the original URL.
2182 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2183 request = compat_urllib_request.Request(json_url)
2184 self.report_extraction(mobj.group(1))
2187 urlh = compat_urllib_request.urlopen(request)
2188 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2189 basename = url.split('/')[-1]
2190 title,ext = os.path.splitext(basename)
# NOTE(review): .decode('UTF-8') assumes a byte string (Python 2 semantics).
2191 title = title.decode('UTF-8')
2192 ext = ext.replace('.', '')
2193 self.report_direct_download(title)
# Direct-download info dict (opener and most fields elided in this listing).
2198 'upload_date': None,
2203 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2204 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2206 if info is None: # Regular URL
2208 json_code_bytes = urlh.read()
2209 json_code = json_code_bytes.decode('utf-8')
2210 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2211 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
2215 json_data = json.loads(json_code)
# blip.tv wraps the payload in a 'Post' key for some responses.
2216 if 'Post' in json_data:
2217 data = json_data['Post']
# datestamp like '08-15-12 02:46PM' reformatted to YYYYMMDD.
2221 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2222 video_url = data['media']['url']
2223 umobj = re.match(self._URL_EXT, video_url)
2225 raise ValueError('Can not determine filename extension')
2226 ext = umobj.group(1)
# Regular-URL info dict (opener elided in this listing).
2229 'id': data['item_id'],
2231 'uploader': data['display_name'],
2232 'upload_date': upload_date,
2233 'title': data['title'],
2235 'format': data['media']['mimeType'],
2236 'thumbnail': data['thumbnailUrl'],
2237 'description': data['description'],
2238 'player_url': data['embedUrl']
# ValueError/KeyError cover both strptime failures and missing JSON fields.
2240 except (ValueError,KeyError) as err:
2241 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
# blip.tv's servers serve the media to the iTunes user agent; mutates the
# module-level std_headers used by the downloader for the actual file fetch.
2244 std_headers['User-Agent'] = 'iTunes/10.6.1'
# NOTE(review): numbered listing with elided lines ('if mobj is None:',
# 'return', 'try:', the info dict opener). Code kept byte-identical; only
# review comments are added.
2248 class MyVideoIE(InfoExtractor):
2249 """Information Extractor for myvideo.de."""
# Group 1: numeric video id; group 2: slug (unused below).
2251 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2252 IE_NAME = u'myvideo'
2254 def __init__(self, downloader=None):
2255 InfoExtractor.__init__(self, downloader)
2257 def report_download_webpage(self, video_id):
2258 """Report webpage download."""
2259 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2261 def report_extraction(self, video_id):
2262 """Report information extraction."""
2263 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
# Fetches the watch page, derives the FLV URL from the thumbnail link, and
# takes the title from the HTML <title> element.
2265 def _real_extract(self,url):
2266 mobj = re.match(self._VALID_URL, url)
# BUG(review): 'self._download' is a typo for 'self._downloader' — every other
# call in this file uses self._downloader.trouble; this line would raise
# AttributeError when an invalid URL is passed.
2268 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2271 video_id = mobj.group(1)
2274 request = compat_urllib_request.Request('http://www.myvideo.de/watch/%s' % video_id)
2276 self.report_download_webpage(video_id)
2277 webpage = compat_urllib_request.urlopen(request).read()
2278 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2279 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
2282 self.report_extraction(video_id)
# The media base URL is recovered from the image_src thumbnail link.
2283 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2286 self._downloader.trouble(u'ERROR: unable to extract media URL')
2288 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2290 mobj = re.search('<title>([^<]+)</title>', webpage)
2292 self._downloader.trouble(u'ERROR: unable to extract title')
2295 video_title = mobj.group(1)
# Returned info dict (opener and remaining fields elided in this listing).
2301 'upload_date': None,
2302 'title': video_title,
# NOTE(review): numbered listing with elided lines (the _video_extensions /
# _video_dimensions entries, 'try:'/'return'/'else:' lines, the results/info
# assembly). Code kept byte-identical; only review comments are added.
2306 class ComedyCentralIE(InfoExtractor):
2307 """Information extractor for The Daily Show and Colbert Report """
2309 # urls can be abbreviations like :thedailyshow or :colbert
2310 # urls for episodes like:
2311 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2312 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2313 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
# Verbose regex (matched with re.VERBOSE everywhere in this class).
2314 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2315 |(https?://)?(www\.)?
2316 (?P<showname>thedailyshow|colbertnation)\.com/
2317 (full-episodes/(?P<episode>.*)|
2319 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2320 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
2322 IE_NAME = u'comedycentral'
# Bitrates, lowest to highest; turls[-1] below therefore picks the best.
2324 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2326 _video_extensions = {
2334 _video_dimensions = {
# Overrides the base suitable() because _VALID_URL needs re.VERBOSE.
2343 def suitable(self, url):
2344 """Receives a URL and returns True if suitable for this IE."""
2345 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
2347 def report_extraction(self, episode_id):
2348 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2350 def report_config_download(self, episode_id):
2351 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2353 def report_index_download(self, episode_id):
2354 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2356 def report_player_url(self, episode_id):
2357 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
# Prints bitrate/extension/dimensions for each known format (loop header elided).
2360 def _print_formats(self, formats):
2361 print('Available formats:')
2363 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
# Resolves shortnames to the newest full episode, follows the redirect,
# locates the mtvnservices player URI, downloads the MRSS index, and builds
# one info dict per <item> from its mediaGen configuration.
2366 def _real_extract(self, url):
2367 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2369 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2372 if mobj.group('shortname'):
2373 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2374 url = u'http://www.thedailyshow.com/full-episodes/'
2376 url = u'http://www.colbertnation.com/full-episodes/'
# Re-match against the rewritten URL; guaranteed to match by construction.
2377 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2378 assert mobj is not None
2380 if mobj.group('clip'):
2381 if mobj.group('showname') == 'thedailyshow':
2382 epTitle = mobj.group('tdstitle')
2384 epTitle = mobj.group('cntitle')
# No episode in the URL means "download the newest" (handled after redirect).
2387 dlNewest = not mobj.group('episode')
2389 epTitle = mobj.group('showname')
2391 epTitle = mobj.group('episode')
2393 req = compat_urllib_request.Request(url)
2394 self.report_extraction(epTitle)
2396 htmlHandle = compat_urllib_request.urlopen(req)
2397 html = htmlHandle.read()
2398 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2399 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
# geturl() reflects any HTTP redirect to the concrete episode page.
2402 url = htmlHandle.geturl()
2403 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2405 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2407 if mobj.group('episode') == '':
2408 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2410 epTitle = mobj.group('episode')
2412 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', html)
2414 if len(mMovieParams) == 0:
2415 # The Colbert Report embeds the information in a without
2416 # a URL prefix; so extract the alternate reference
2417 # and then add the URL prefix manually.
2419 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', html)
2420 if len(altMovieParams) == 0:
2421 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2424 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2426 playerUrl_raw = mMovieParams[0][0]
2427 self.report_player_url(epTitle)
2429 urlHandle = compat_urllib_request.urlopen(playerUrl_raw)
# Resolve the player URL through its redirect chain.
2430 playerUrl = urlHandle.geturl()
2431 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2432 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + compat_str(err))
2435 uri = mMovieParams[0][1]
2436 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2437 self.report_index_download(epTitle)
2439 indexXml = compat_urllib_request.urlopen(indexUrl).read()
2440 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2441 self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
2446 idoc = xml.etree.ElementTree.fromstring(indexXml)
2447 itemEls = idoc.findall('.//item')
2448 for itemEl in itemEls:
# guid looks like 'mgid:...:<show>.com:<id>' — split on ':' for the parts.
2449 mediaId = itemEl.findall('./guid')[0].text
2450 shortMediaId = mediaId.split(':')[-1]
2451 showId = mediaId.split(':')[-2].replace('.com', '')
2452 officialTitle = itemEl.findall('./title')[0].text
2453 officialDate = itemEl.findall('./pubDate')[0].text
2455 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2456 compat_urllib_parse.urlencode({'uri': mediaId}))
2457 configReq = compat_urllib_request.Request(configUrl)
2458 self.report_config_download(epTitle)
2460 configXml = compat_urllib_request.urlopen(configReq).read()
2461 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2462 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
2465 cdoc = xml.etree.ElementTree.fromstring(configXml)
# turls accumulates (bitrate, stream-url) pairs (its initialiser is elided).
2467 for rendition in cdoc.findall('.//rendition'):
2468 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2472 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2475 if self._downloader.params.get('listformats', None):
2476 self._print_formats([i[0] for i in turls])
2479 # For now, just pick the highest bitrate
2480 format,video_url = turls[-1]
2482 # Get the format arg from the arg stream
2483 req_format = self._downloader.params.get('format', None)
2485 # Select format if we can find one
# The 'for f, v in turls: if f == req_format:' header is elided in this listing.
2488 format, video_url = f, v
2491 # Patch to download from alternative CDN, which does not
2492 # break on current RTMPDump builds
2493 broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
2494 better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"
2496 if video_url.startswith(broken_cdn):
2497 video_url = video_url.replace(broken_cdn, better_cdn)
2499 effTitle = showId + u'-' + epTitle
# Per-item info dict (opener and most fields elided in this listing).
2504 'upload_date': officialDate,
2509 'description': officialTitle,
2510 'player_url': None #playerUrl
2513 results.append(info)
# NOTE(review): numbered listing with elided 'try:'/'return' lines and the
# info-dict opener. Code kept byte-identical; only review comments are added.
2518 class EscapistIE(InfoExtractor):
2519 """Information extractor for The Escapist """
2521 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2522 IE_NAME = u'escapist'
2524 def report_extraction(self, showName):
2525 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2527 def report_config_download(self, showName):
2528 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
# Scrapes the og:* meta tags, follows the player's 'config=' parameter to a
# JS-ish configuration blob, and takes the media URL from its playlist.
2530 def _real_extract(self, url):
2531 mobj = re.match(self._VALID_URL, url)
2533 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2535 showName = mobj.group('showname')
2536 videoId = mobj.group('episode')
2538 self.report_extraction(showName)
2540 webPage = compat_urllib_request.urlopen(url)
2541 webPageBytes = webPage.read()
# Honour the charset from the Content-Type header, defaulting to utf-8.
2542 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2543 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2544 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2545 self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
# NOTE(review): these .group(1) calls assume every meta tag matched; a missing
# tag would raise AttributeError rather than a clean extractor error.
2548 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2549 description = unescapeHTML(descMatch.group(1))
2550 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2551 imgUrl = unescapeHTML(imgMatch.group(1))
2552 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2553 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2554 configUrlMatch = re.search('config=(.*)$', playerUrl)
2555 configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2557 self.report_config_download(showName)
2559 configJSON = compat_urllib_request.urlopen(configUrl).read()
2560 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2561 self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
2564 # Technically, it's JavaScript, not JSON
# Crude single- to double-quote rewrite so json.loads can parse the JS object.
2565 configJSON = configJSON.replace("'", '"')
2568 config = json.loads(configJSON)
2569 except (ValueError,) as err:
2570 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
2573 playlist = config['playlist']
# Index 1 holds the actual media entry in this playlist layout.
2574 videoUrl = playlist[1]['url']
# Returned info dict (opener and remaining fields elided in this listing).
2579 'uploader': showName,
2580 'upload_date': None,
2583 'thumbnail': imgUrl,
2584 'description': description,
2585 'player_url': playerUrl,
# NOTE(review): numbered listing with elided 'try:'/'return' lines and the
# info-dict opener/closer. Code kept byte-identical; only review comments added.
2591 class CollegeHumorIE(InfoExtractor):
2592 """Information extractor for collegehumor.com"""
2595 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2596 IE_NAME = u'collegehumor'
2598 def report_manifest(self, video_id):
2599 """Report information extraction."""
2600 self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))
2602 def report_extraction(self, video_id):
2603 """Report information extraction."""
2604 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
# Downloads the moogaloop metadata XML, then the Adobe HDS (f4m) manifest it
# references, and assembles the fragment URL from the manifest's media node.
2606 def _real_extract(self, url):
2607 mobj = re.match(self._VALID_URL, url)
2609 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2611 video_id = mobj.group('videoid')
# 'info' dict is initialised on elided lines; filled in from the XML below.
2616 'upload_date': None,
2619 self.report_extraction(video_id)
2620 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2622 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2623 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2624 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2627 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2629 videoNode = mdoc.findall('./video')[0]
2630 info['description'] = videoNode.findall('./description')[0].text
2631 info['title'] = videoNode.findall('./caption')[0].text
2632 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2633 manifest_url = videoNode.findall('./file')[0].text
# IndexError from the findall(...)[0] chain means malformed metadata.
2635 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2638 manifest_url += '?hdcore=2.10.3'
2639 self.report_manifest(video_id)
2641 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2642 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2643 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2646 adoc = xml.etree.ElementTree.fromstring(manifestXml)
# f4m elements live in the Adobe F4M namespace; note video_id is re-bound here
# to the manifest's <id> text, shadowing the URL-derived id.
2648 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2649 node_id = media_node.attrib['url']
2650 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2651 except IndexError as err:
2652 self._downloader.trouble(u'\nERROR: Invalid manifest file')
2655 url_pr = compat_urllib_parse_urlparse(manifest_url)
# HDS fragment URL: scheme://host/z<id-minus-2-chars>/<node>Seg1-Frag1.
2656 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
# NOTE(review): numbered listing with elided 'try:'/'return'/'if mobj is None:'
# lines and the info-dict opener. Code kept byte-identical; only comments added.
2663 class XVideosIE(InfoExtractor):
2664 """Information extractor for xvideos.com"""
2666 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2667 IE_NAME = u'xvideos'
2669 def report_webpage(self, video_id):
2670 """Report information extraction."""
2671 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2673 def report_extraction(self, video_id):
2674 """Report information extraction."""
2675 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
# Scrapes flv_url, the <title> element, and the thumbnail URL from the page.
2677 def _real_extract(self, url):
2678 mobj = re.match(self._VALID_URL, url)
2680 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2682 video_id = mobj.group(1)
2684 self.report_webpage(video_id)
2686 request = compat_urllib_request.Request(r'http://www.xvideos.com/video' + video_id)
2688 webpage_bytes = compat_urllib_request.urlopen(request).read()
# 'replace' keeps extraction alive on stray invalid UTF-8 bytes.
2689 webpage = webpage_bytes.decode('utf-8', 'replace')
2690 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2691 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2694 self.report_extraction(video_id)
# The flv URL is percent-encoded inside a query-string fragment.
2698 mobj = re.search(r'flv_url=(.+?)&', webpage)
2700 self._downloader.trouble(u'ERROR: unable to extract video url')
2702 video_url = compat_urllib_parse.unquote(mobj.group(1))
2706 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2708 self._downloader.trouble(u'ERROR: unable to extract video title')
2710 video_title = mobj.group(1)
2713 # Extract video thumbnail
2714 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2716 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
# group(0): the whole matched URL, not just the trailing filename group.
2718 video_thumbnail = mobj.group(0)
# Returned info dict (opener and remaining fields elided in this listing).
2724 'upload_date': None,
2725 'title': video_title,
2727 'thumbnail': video_thumbnail,
2728 'description': None,
# NOTE(review): numbered listing with elided 'try:'/'return' lines and the
# info-dict opener. Code kept byte-identical; only review comments are added.
2734 class SoundcloudIE(InfoExtractor):
2735 """Information extractor for soundcloud.com
2736 To access the media, the uid of the song and a stream token
2737 must be extracted from the page source and the script must make
2738 a request to media.soundcloud.com/crossdomain.xml. Then
2739 the media can be grabbed by requesting from an url composed
2740 of the stream token and uid
# Group 1: uploader slug; group 2: track slug.
2743 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2744 IE_NAME = u'soundcloud'
2746 def __init__(self, downloader=None):
2747 InfoExtractor.__init__(self, downloader)
2749 def report_resolve(self, video_id):
2750 """Report information extraction."""
2751 self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))
2753 def report_extraction(self, video_id):
2754 """Report information extraction."""
2755 self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))
# Resolves the page URL to track metadata via the public resolve.json API,
# then asks the streams endpoint for the 128kbit MP3 URL.
2757 def _real_extract(self, url):
2758 mobj = re.match(self._VALID_URL, url)
2760 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2763 # extract uploader (which is in the url)
2764 uploader = mobj.group(1)
2765 # extract simple title (uploader + slug of song title)
2766 slug_title = mobj.group(2)
2767 simple_title = uploader + u'-' + slug_title
2769 self.report_resolve('%s/%s' % (uploader, slug_title))
2771 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
# client_id is a public API key hard-coded for this extractor.
2772 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2773 request = compat_urllib_request.Request(resolv_url)
2775 info_json_bytes = compat_urllib_request.urlopen(request).read()
2776 info_json = info_json_bytes.decode('utf-8')
2777 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2778 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2781 info = json.loads(info_json)
2782 video_id = info['id']
2783 self.report_extraction('%s/%s' % (uploader, slug_title))
2785 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2786 request = compat_urllib_request.Request(streams_url)
2788 stream_json_bytes = compat_urllib_request.urlopen(request).read()
2789 stream_json = stream_json_bytes.decode('utf-8')
2790 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2791 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2794 streams = json.loads(stream_json)
2795 mediaURL = streams['http_mp3_128_url']
# Returned info dict (opener elided). NOTE(review): 'created_at' is the API's
# raw timestamp, likely not the YYYYMMDD form the class docstring (head of
# file) prescribes for upload_date — verify against the downloader.
2800 'uploader': info['user']['username'],
2801 'upload_date': info['created_at'],
2802 'title': info['title'],
2804 'description': info['description'],
# NOTE(review): numbered listing with elided 'try:'/'return'/'if mobj is None:'
# lines, the IE_NAME line and the info-dict opener. Code kept byte-identical.
2808 class InfoQIE(InfoExtractor):
2809 """Information extractor for infoq.com"""
2811 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2814 def report_webpage(self, video_id):
2815 """Report information extraction."""
2816 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2818 def report_extraction(self, video_id):
2819 """Report information extraction."""
2820 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
# Builds an rtmpe URL from the base64-encoded 'jsclassref' page attribute and
# scrapes title/description from the page source.
2822 def _real_extract(self, url):
2823 mobj = re.match(self._VALID_URL, url)
2825 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2828 self.report_webpage(url)
2830 request = compat_urllib_request.Request(url)
2832 webpage = compat_urllib_request.urlopen(request).read()
2833 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2834 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2837 self.report_extraction(url)
2841 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2843 self._downloader.trouble(u'ERROR: unable to extract video url')
# NOTE(review): str.decode('base64') is Python-2-only (codec removed from
# str in Python 3) — confirm which runtime this file targets.
2845 video_url = 'rtmpe://video.infoq.com/cfx/st/' + compat_urllib_parse.unquote(mobj.group(1).decode('base64'))
2849 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2851 self._downloader.trouble(u'ERROR: unable to extract video title')
2853 video_title = mobj.group(1).decode('utf-8')
2855 # Extract description
2856 video_description = u'No description available.'
2857 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2858 if mobj is not None:
2859 video_description = mobj.group(1).decode('utf-8')
# The media filename doubles as the video id; extension split off the name.
2861 video_filename = video_url.split('/')[-1]
2862 video_id, extension = video_filename.split('.')
# Returned info dict (opener and remaining fields elided in this listing).
2868 'upload_date': None,
2869 'title': video_title,
2870 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2872 'description': video_description,
# NOTE(review): numbered listing with elided 'try:'/'return'/'break'/'else:'
# lines. Code kept byte-identical; only review comments are added.
2877 class MixcloudIE(InfoExtractor):
2878 """Information extractor for www.mixcloud.com"""
# Group 1: uploader slug; group 2: cloudcast slug.
2879 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2880 IE_NAME = u'mixcloud'
2882 def __init__(self, downloader=None):
2883 InfoExtractor.__init__(self, downloader)
2885 def report_download_json(self, file_id):
2886 """Report JSON download."""
2887 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2889 def report_extraction(self, file_id):
2890 """Report information extraction."""
2891 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
# Returns the URL list for a format; handles both dict-of-bitrates and flat
# list layouts ('best' or an unknown bitrate falls back to the highest).
2893 def get_urls(self, jsonData, fmt, bitrate='best'):
2894 """Get urls from 'audio_formats' section in json"""
2897 bitrate_list = jsonData[fmt]
2898 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2899 bitrate = max(bitrate_list) # select highest
2901 url_list = jsonData[fmt][bitrate]
2902 except TypeError: # we have no bitrate info.
2903 url_list = jsonData[fmt]
# Probes each candidate URL with a live request; elided lines hold the
# 'try:'/return-on-success/continue-on-error bodies.
2906 def check_urls(self, url_list):
2907 """Returns 1st active url from list"""
2908 for url in url_list:
2910 compat_urllib_request.urlopen(url)
2912 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2917 def _print_formats(self, formats):
2918 print('Available formats:')
2919 for fmt in formats.keys():
2920 for b in formats[fmt]:
2922 ext = formats[fmt][b][0]
2923 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2924 except TypeError: # we have no bitrate info
2925 ext = formats[fmt][0]
2926 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
# Queries the cloudcast API for 'audio_formats', picks the requested (or first
# working) format, and verifies the stream URL before returning the info dict.
2929 def _real_extract(self, url):
2930 mobj = re.match(self._VALID_URL, url)
2932 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2934 # extract uploader & filename from url
# NOTE(review): .decode('utf-8') on regex groups assumes byte strings
# (Python 2 semantics) — confirm target runtime.
2935 uploader = mobj.group(1).decode('utf-8')
2936 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2938 # construct API request
2939 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2940 # retrieve .json file with links to files
2941 request = compat_urllib_request.Request(file_url)
2943 self.report_download_json(file_url)
2944 jsonData = compat_urllib_request.urlopen(request).read()
2945 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2946 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
2950 json_data = json.loads(jsonData)
2951 player_url = json_data['player_swf_url']
2952 formats = dict(json_data['audio_formats'])
2954 req_format = self._downloader.params.get('format', None)
2957 if self._downloader.params.get('listformats', None):
2958 self._print_formats(formats)
# No explicit format (or 'best'): take the first format whose URL responds.
2961 if req_format is None or req_format == 'best':
2962 for format_param in formats.keys():
2963 url_list = self.get_urls(formats, format_param)
2965 file_url = self.check_urls(url_list)
2966 if file_url is not None:
2969 if req_format not in formats.keys():
2970 self._downloader.trouble(u'ERROR: format is not available')
2973 url_list = self.get_urls(formats, req_format)
2974 file_url = self.check_urls(url_list)
2975 format_param = req_format
# Returned info dict (opener elided in this listing).
2978 'id': file_id.decode('utf-8'),
2979 'url': file_url.decode('utf-8'),
2980 'uploader': uploader.decode('utf-8'),
2981 'upload_date': None,
2982 'title': json_data['name'],
2983 'ext': file_url.split('.')[-1].decode('utf-8'),
2984 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2985 'thumbnail': json_data['thumbnail_url'],
2986 'description': json_data['description'],
2987 'player_url': player_url.decode('utf-8'),
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    # Host dots escaped (was 'openclassroom.stanford.edu' with bare dots,
    # which matched any character in those positions).
    _VALID_URL = r'^(?:https?://)?openclassroom\.stanford\.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Dispatch on URL shape: single video, course page, or site root.

        Course/root pages recurse through self.extract() on each
        referenced page and concatenate the resulting info dicts.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            try:
                coursepage = compat_urllib_request.urlopen(url).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results

        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            # Scheme is optional in _VALID_URL; default to plain http.
            url = 'http://' + url
        video_id = mobj.group('videoid')
        self.report_webpage(video_id)

        request = compat_urllib_request.Request(url)
        try:
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        # Song/performer metadata lives in <meta> tags on the video page.
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # Message fixed: previously read 'unable to mtvn_uri'.
            self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com."""

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
    IE_NAME = u'Youku'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)

    def _gen_sid(self):
        """Build a session id: millisecond timestamp plus two random suffixes."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Shuffle the alphabet with Youku's seeded LCG; returns a char list.

        Each step advances the LCG and moves one character from `source`
        to `mixed`, so the permutation is fully determined by `seed`.
        """
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Descramble a '*'-separated index string into the real file id."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = config['data'][0]['streamfileids'].keys()

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&'
    # NOTE(review): the '.' in 'XNXX.COM' is unescaped and so matches any
    # character; left untouched since it is a runtime pattern.
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        # Get webpage content
        try:
            webpage_bytes = compat_urllib_request.urlopen(url).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return

        # The flash url, title and thumbnail are scraped from the page source.
        result = re.search(self.VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group(1))

        result = re.search(self.VIDEO_TITLE_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = result.group(1)

        result = re.search(self.VIDEO_THUMB_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = result.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading extry"""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url.decode('utf-8'))

    def report_date(self, upload_date):
        """Report downloading extry"""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report downloading extry"""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader.decode('utf-8'))

    def report_title(self, video_title):
        """Report downloading extry"""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title.decode('utf-8'))

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page.decode('utf-8'))

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(2)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date
        upload_date = u'NA'
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = u'NA'
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video page URL')
            # Bail out; without this return the next line raised
            # AttributeError on None.
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)

        # Extract video links on video page
        """Extract video links of all sizes"""
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.trouble(u'ERROR: unable to extract video links')
            # Bail out; without this return the indexing below raised
            # IndexError on an empty list.
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        # NOTE(review): `unicode` is a Python 2 builtin, consistent with the
        # u''-literal style used throughout this module.
        video_url = unicode(video_url, "unicode_escape")

        return [{
            'id': video_id.decode('utf-8'),
            'url': video_url,
            'uploader': uploader.decode('utf-8'),
            'upload_date': upload_date.decode('utf-8'),
            'title': video_title.decode('utf-8'),
            'ext': video_extension.decode('utf-8'),
        }]
class NBAIE(InfoExtractor):
    """Information extractor for watch.nba.com / www.nba.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        self.report_extraction(video_id)
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        # The CDN url is derived from the page path, not scraped.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            """First regex group from the page, HTML-unescaped, or default."""
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # Key fixed: was 'uploader_date', which no consumer reads; the
            # documented field name is 'upload_date'.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3545 class JustinTVIE(InfoExtractor):
3546 """Information extractor for justin.tv and twitch.tv"""
3547 # TODO: One broadcast may be split into multiple videos. The key
3548 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3549 # starts at 1 and increases. Can we treat all parts as one video?
3551 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3552 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3553 _JUSTIN_PAGE_LIMIT = 100
3554 IE_NAME = u'justin.tv'
3556 def report_extraction(self, file_id):
3557 """Report information extraction."""
3558 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3560 def report_download_page(self, channel, offset):
3561 """Report attempt to download a single page of videos."""
3562 self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
3563 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3565 # Return count of items, list of *valid* items
3566 def _parse_page(self, url):
3568 urlh = compat_urllib_request.urlopen(url)
3569 webpage_bytes = urlh.read()
3570 webpage = webpage_bytes.decode('utf-8', 'ignore')
3571 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3572 self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
3575 response = json.loads(webpage)
3577 for clip in response:
3578 video_url = clip['video_file_url']
3580 video_extension = os.path.splitext(video_url)[1][1:]
3581 video_date = re.sub('-', '', clip['created_on'][:10])
3585 'title': clip['title'],
3586 'uploader': clip.get('user_id', clip.get('channel_id')),
3587 'upload_date': video_date,
3588 'ext': video_extension,
3590 return (len(response), info)
3592 def _real_extract(self, url):
3593 mobj = re.match(self._VALID_URL, url)
3595 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3598 api = 'http://api.justin.tv'
3599 video_id = mobj.group(mobj.lastindex)
3601 if mobj.lastindex == 1:
3603 api += '/channel/archives/%s.json'
3605 api += '/clip/show/%s.json'
3606 api = api % (video_id,)
3608 self.report_extraction(video_id)
3612 limit = self._JUSTIN_PAGE_LIMIT
3615 self.report_download_page(video_id, offset)
3616 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3617 page_count, page_info = self._parse_page(page_url)
3618 info.extend(page_info)
3619 if not paged or page_count != limit: