2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
13 import xml.etree.ElementTree
# Abstract base class defining the contract every site-specific extractor
# implements: suitable() for URL matching, initialize()/extract() as public
# template methods, _real_initialize()/_real_extract() as subclass hooks.
# NOTE(review): the leading integers on each line are original-source line
# numbers from an elided listing; several intermediate lines are missing
# (e.g. the `def` line that should precede original line 77). Confirm any
# edit against the complete source.
20 class InfoExtractor(object):
21 """Information Extractor class.
23 Information extractors are the classes that, given a URL, extract
24 information about the video (or videos) the URL refers to. This
25 information includes the real video URL, the video title, author and
26 others. The information is stored in a dictionary which is then
27 passed to the FileDownloader. The FileDownloader processes this
28 information possibly downloading the video to the file system, among
29 other possible outcomes.
31 The dictionaries must include the following fields:
35 uploader: Nickname of the video uploader, unescaped.
36 upload_date: Video upload date (YYYYMMDD).
37 title: Video title, unescaped.
38 ext: Video filename extension.
40 The following fields are optional:
42 format: The video format, defaults to ext (used for --get-format)
43 thumbnail: Full URL to a video thumbnail image.
44 description: One-line video description.
45 player_url: SWF Player URL (used for rtmpdump).
46 subtitles: The .srt file contents.
47 urlhandle: [internal] The urlHandle to be used to download the file,
48 like returned by urllib.request.urlopen
50 The fields should all be Unicode strings.
52 Subclasses of this one should re-define the _real_initialize() and
53 _real_extract() methods and define a _VALID_URL regexp.
54 Probably, they should also be added to the list of extractors.
56 _real_extract() must return a *list* of information dictionaries as
59 Finally, the _WORKING attribute should be set to False for broken IEs
60 in order to warn the users and skip the tests.
# Store the (optional) FileDownloader through the single setter so there is
# one code path for attaching a downloader.
67 def __init__(self, downloader=None):
68 """Constructor. Receives an optional downloader."""
70 self.set_downloader(downloader)
# Default matcher: subclasses relying on verbose regexps override this
# (see YoutubeIE.suitable below, which adds re.VERBOSE).
72 def suitable(self, url):
73 """Receives a URL and returns True if suitable for this IE."""
74 return re.match(self._VALID_URL, url) is not None
# NOTE(review): the `def` line of this accessor (presumably returning
# _WORKING) is elided from this listing.
77 """Getter method for _WORKING."""
# Public template method; one-time setup is delegated to the subclass hook.
81 """Initializes an instance (authentication, etc)."""
83 self._real_initialize()
# Public template method; actual per-URL extraction lives in the hook.
86 def extract(self, url):
87 """Extracts URL information and returns it in list of dicts."""
89 return self._real_extract(url)
91 def set_downloader(self, downloader):
92 """Sets the downloader for this IE."""
93 self._downloader = downloader
# Subclass hooks. Their (no-op) bodies are elided from this listing.
95 def _real_initialize(self):
96 """Real initialization process. Redefine in subclasses."""
99 def _real_extract(self, url):
100 """Real extraction process. Redefine in subclasses."""
# Extractor for youtube.com watch pages. Handles language forcing, optional
# login (params or .netrc), age-gate confirmation, get_video_info retrieval,
# closed-caption download/SRT conversion, and itag-based format selection.
# NOTE(review): elided listing — leading integers are original line numbers;
# lines such as the `_VALID_URL = r'''` opening (before line 109), `try:`
# headers, `return` statements and most dict entries are missing. Confirm
# against the complete source before changing anything here.
104 class YoutubeIE(InfoExtractor):
105 """Information extractor for youtube.com."""
# Verbose-mode regexp matching watch/embed/short URL forms; group(1) captures
# the "things that can precede the ID", group(2) the 11-char video ID.
109 (?:https?://)? # http(s):// (optional)
110 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
111 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
112 (?:.*?\#/)? # handle anchor (#/) redirect urls
113 (?!view_play_list|my_playlists|artist|playlist) # ignore playlist URLs
114 (?: # the various things that can precede the ID:
115 (?:(?:v|embed|e)/) # v/ or embed/ or e/
116 |(?: # or the v= param in all its forms
117 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
118 (?:\?|\#!?) # the params delimiter ? or # or #!
119 (?:.+&)? # any other preceding param (like /?s=tuff&v=xxxx)
122 )? # optional -> youtube.com/xxxx is OK
123 )? # all until now is optional -> you can pass the naked ID
124 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
125 (?(1).+)? # if we found the ID, everything can follow
# Endpoints: force English UI, signup-based login, and the age gate.
127 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
128 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
129 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
130 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
131 _NETRC_MACHINE = 'youtube'
132 # Listed in order of quality
133 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
134 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
# itag -> filename-extension map (most entries elided from this listing).
135 _video_extensions = {
141 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
# itag -> display-dimension map (all entries elided from this listing).
147 _video_dimensions = {
# Overrides the base matcher because the multi-line, commented _VALID_URL
# above only works when compiled with re.VERBOSE.
165 def suitable(self, url):
166 """Receives a URL and returns True if suitable for this IE."""
167 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
169 def report_lang(self):
170 """Report attempt to set language."""
171 self._downloader.to_screen(u'[youtube] Setting language')
173 def report_login(self):
174 """Report attempt to log in."""
175 self._downloader.to_screen(u'[youtube] Logging in')
177 def report_age_confirmation(self):
178 """Report attempt to confirm age."""
179 self._downloader.to_screen(u'[youtube] Confirming age')
181 def report_video_webpage_download(self, video_id):
182 """Report attempt to download video webpage."""
183 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
185 def report_video_info_webpage_download(self, video_id):
186 """Report attempt to download video info webpage."""
187 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
189 def report_video_subtitles_download(self, video_id):
190 """Report attempt to download video subtitles."""
191 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
193 def report_information_extraction(self, video_id):
194 """Report attempt to extract video information."""
195 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
197 def report_unavailable_format(self, video_id, format):
198 """Report that the requested format is not available."""
199 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
201 def report_rtmp_download(self):
202 """Indicate the download will use the RTMP protocol."""
203 self._downloader.to_screen(u'[youtube] RTMP download detected')
# Converts YouTube's timedtext XML into SubRip (.srt) text.
# NOTE(review): elided — the `srt` accumulator initialization (~line 206),
# the `start = float(start)` conversion (line 211) and the final `return`
# are missing from this listing; without line 211 the arithmetic on line
# 212 would fail on a str, so those lines must exist in the full source.
205 def _closed_captions_xml_to_srt(self, xml_string):
207 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
208 # TODO parse xml instead of regex
209 for n, (start, dur_tag, dur, caption) in enumerate(texts):
210 if not dur: dur = '4'
212 end = start + float(dur)
# Format seconds as the SRT timestamp HH:MM:SS,mmm.
213 start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
214 end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
215 caption = unescapeHTML(caption)
216 caption = unescapeHTML(caption) # double cycle, intentional
217 srt += str(n+1) + '\n'
218 srt += start + ' --> ' + end + '\n'
219 srt += caption + '\n\n'
# Pretty-prints itag / extension / dimensions for --list-formats.
# NOTE(review): the `for x in formats:` header (line 224) is elided.
222 def _print_formats(self, formats):
223 print('Available formats:')
225 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
# One-time setup: pick credentials (explicit params beat .netrc), force the
# English UI, log in if credentials exist, then confirm the age gate.
# NOTE(review): several `try:` / `return` / early-exit lines are elided.
227 def _real_initialize(self):
228 if self._downloader is None:
233 downloader_params = self._downloader.params
235 # Attempt to use provided username and password or .netrc data
236 if downloader_params.get('username', None) is not None:
237 username = downloader_params['username']
238 password = downloader_params['password']
239 elif downloader_params.get('usenetrc', False):
241 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
246 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
247 except (IOError, netrc.NetrcParseError) as err:
248 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
# Set the UI language first so later scraping sees English pages.
252 request = compat_urllib_request.Request(self._LANG_URL)
255 compat_urllib_request.urlopen(request).read()
256 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
257 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
260 # No authentication to be performed
# POST the login form; a login form still present in the response means
# the credentials were rejected.
266 'current_form': 'loginForm',
268 'action_login': 'Log In',
269 'username': username,
270 'password': password,
272 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
275 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
276 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
277 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
279 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
280 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
# Confirm the age gate so age-restricted videos are reachable.
286 'action_confirm': 'Confirm',
288 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
290 self.report_age_confirmation()
291 age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
292 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
293 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
# Main extraction. NOTE(review): many `try:` headers, `return`s and `else:`
# branches are elided from this listing.
296 def _real_extract(self, url):
297 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
298 mobj = re.search(self._NEXT_URL_RE, url)
300 url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
302 # Extract video id from URL
303 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
305 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# group(2) is the ID: group(1) is the "things that precede the ID" group.
307 video_id = mobj.group(2)
310 self.report_video_webpage_download(video_id)
311 request = compat_urllib_request.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
313 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
314 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
315 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
318 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
320 # Attempt to extract SWF player URL
321 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
# Un-escape the JSON-style backslash escaping in the SWF URL.
323 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
# Try each `el` variant of get_video_info until one returns a token.
328 self.report_video_info_webpage_download(video_id)
329 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
330 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
331 % (video_id, el_type))
332 request = compat_urllib_request.Request(video_info_url)
334 video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
335 video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
336 video_info = compat_parse_qs(video_info_webpage)
337 if 'token' in video_info:
339 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
340 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
342 if 'token' not in video_info:
343 if 'reason' in video_info:
344 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
346 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
349 # Check for "rental" videos
350 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
351 self._downloader.trouble(u'ERROR: "rental" videos not supported')
354 # Start extracting information
355 self.report_information_extraction(video_id)
# uploader (mandatory)
358 if 'author' not in video_info:
359 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
361 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
# title (mandatory)
364 if 'title' not in video_info:
365 self._downloader.trouble(u'ERROR: unable to extract video title')
367 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
# thumbnail (optional: warning only)
370 if 'thumbnail_url' not in video_info:
371 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
373 else: # don't panic if we can't find it
374 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
# upload date: scraped from the watch page, normalized to YYYYMMDD by
# trying several textual date formats.
378 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
380 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
381 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
382 for expression in format_expressions:
384 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
# description
389 video_description = get_element_by_id("eow-description", video_webpage)
390 if video_description:
391 video_description = clean_html(video_description)
393 video_description = ''
# closed captions: list available tracks, pick the requested language
# (falling back to English, then the first track), download and convert.
# Failures raise Trouble, caught below and reported as warnings.
396 video_subtitles = None
397 if self._downloader.params.get('writesubtitles', False):
399 self.report_video_subtitles_download(video_id)
400 request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
402 srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
403 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
404 raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
405 srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
406 srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
407 if not srt_lang_list:
408 raise Trouble(u'WARNING: video has no closed captions')
409 if self._downloader.params.get('subtitleslang', False):
410 srt_lang = self._downloader.params.get('subtitleslang')
411 elif 'en' in srt_lang_list:
# NOTE(review): `.keys()[0]` only works on Python 2 dicts; on
# Python 3 this would need list(...) — confirm target runtime.
414 srt_lang = srt_lang_list.keys()[0]
415 if not srt_lang in srt_lang_list:
416 raise Trouble(u'WARNING: no closed captions found in the specified language')
417 request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
419 srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
420 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
421 raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
423 raise Trouble(u'WARNING: unable to download video subtitles')
424 video_subtitles = self._closed_captions_xml_to_srt(srt_xml)
425 except Trouble as trouble:
426 self._downloader.trouble(str(trouble))
# duration (optional)
428 if 'length_seconds' not in video_info:
429 self._downloader.trouble(u'WARNING: unable to extract video duration')
432 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
# token (guaranteed present by the check above)
435 video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
437 # Decide which formats to download
438 req_format = self._downloader.params.get('format', None)
# RTMP streams carry a single 'conn' URL; otherwise build an itag->URL
# map from url_encoded_fmt_stream_map (URL + '&signature=' + sig).
440 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
441 self.report_rtmp_download()
442 video_url_list = [(None, video_info['conn'][0])]
443 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
444 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
445 url_data = [compat_parse_qs(uds) for uds in url_data_strs]
446 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
447 url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
449 format_limit = self._downloader.params.get('format_limit', None)
450 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
451 if format_limit is not None and format_limit in available_formats:
452 format_list = available_formats[available_formats.index(format_limit):]
454 format_list = available_formats
455 existing_formats = [x for x in format_list if x in url_map]
456 if len(existing_formats) == 0:
457 self._downloader.trouble(u'ERROR: no known formats available for video')
459 if self._downloader.params.get('listformats', None):
460 self._print_formats(existing_formats)
462 if req_format is None or req_format == 'best':
463 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
464 elif req_format == 'worst':
465 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
466 elif req_format in ('-1', 'all'):
467 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
469 # Specific formats. We pick the first in a slash-delimeted sequence.
470 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
471 req_formats = req_format.split('/')
472 video_url_list = None
473 for rf in req_formats:
475 video_url_list = [(rf, url_map[rf])]
477 if video_url_list is None:
478 self._downloader.trouble(u'ERROR: requested format not available')
481 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
# Build one result dict per selected format; the dict-literal opening and
# closing lines are elided from this listing.
485 for format_param, video_real_url in video_url_list:
487 video_extension = self._video_extensions.get(format_param, 'flv')
489 video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
490 self._video_dimensions.get(format_param, '???'))
494 'url': video_real_url,
495 'uploader': video_uploader,
496 'upload_date': upload_date,
497 'title': video_title,
498 'ext': video_extension,
499 'format': video_format,
500 'thumbnail': video_thumbnail,
501 'description': video_description,
502 'player_url': player_url,
503 'subtitles': video_subtitles,
504 'duration': video_duration
# Extractor for metacafe.com. Initialization clicks through the family
# filter / disclaimer; extraction handles both direct media URLs (with an
# optional gdaKey) and the flashvars "mediaData" fallback, and delegates
# yt- prefixed IDs to the YouTube extractor.
# NOTE(review): elided listing — `try:` headers, `return` statements and
# dict-literal delimiters are missing; leading integers are original line
# numbers. Confirm against the complete source before editing.
509 class MetacafeIE(InfoExtractor):
510 """Information Extractor for metacafe.com."""
512 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
513 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
514 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
515 IE_NAME = u'metacafe'
517 def __init__(self, downloader=None):
518 InfoExtractor.__init__(self, downloader)
520 def report_disclaimer(self):
521 """Report disclaimer retrieval."""
522 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
524 def report_age_confirmation(self):
525 """Report attempt to confirm age."""
526 self._downloader.to_screen(u'[metacafe] Confirming age')
528 def report_download_webpage(self, video_id):
529 """Report webpage download."""
530 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
532 def report_extraction(self, video_id):
533 """Report information extraction."""
534 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
# Fetch the disclaimer page, then POST the "over 18" confirmation so
# filtered videos become reachable.
536 def _real_initialize(self):
537 # Retrieve disclaimer
538 request = compat_urllib_request.Request(self._DISCLAIMER)
540 self.report_disclaimer()
541 disclaimer = compat_urllib_request.urlopen(request).read()
542 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
543 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
# POST body of the age/family-filter form (opening literal elided).
549 'submit': "Continue - I'm over 18",
551 request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
553 self.report_age_confirmation()
554 disclaimer = compat_urllib_request.urlopen(request).read()
555 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
556 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
559 def _real_extract(self, url):
560 # Extract id and simplified title from URL
561 mobj = re.match(self._VALID_URL, url)
563 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
566 video_id = mobj.group(1)
568 # Check if video comes from YouTube
569 mobj2 = re.match(r'^yt-(.*)$', video_id)
570 if mobj2 is not None:
# "yt-<id>" videos are really hosted on YouTube: hand the rewritten
# URL back to the downloader and let YoutubeIE handle it.
571 self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
574 # Retrieve video webpage to extract further information
575 request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
577 self.report_download_webpage(video_id)
578 webpage = compat_urllib_request.urlopen(request).read()
579 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
580 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
583 # Extract URL, uploader and title from webpage
584 self.report_extraction(video_id)
# Primary path: page exposes &mediaURL= directly.
585 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
587 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
588 video_extension = mediaURL[-3:]
590 # Extract gdaKey if available
591 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
595 gdaKey = mobj.group(1)
596 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Fallback path: parse the flashvars query string for "mediaData".
598 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
600 self._downloader.trouble(u'ERROR: unable to extract media URL')
602 vardict = compat_parse_qs(mobj.group(1))
603 if 'mediaData' not in vardict:
604 self._downloader.trouble(u'ERROR: unable to extract media URL')
606 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
608 self._downloader.trouble(u'ERROR: unable to extract media URL')
610 mediaURL = mobj.group(1).replace('\\/', '/')
611 video_extension = mediaURL[-3:]
612 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
614 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
616 self._downloader.trouble(u'ERROR: unable to extract title')
# NOTE(review): .decode('utf-8') on regexp groups implies `webpage` is
# bytes here (Python 2 str) — confirm target runtime.
618 video_title = mobj.group(1).decode('utf-8')
620 mobj = re.search(r'submitter=(.*?);', webpage)
622 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
624 video_uploader = mobj.group(1)
# Result dict (opening/closing delimiters elided from this listing).
627 'id': video_id.decode('utf-8'),
628 'url': video_url.decode('utf-8'),
629 'uploader': video_uploader.decode('utf-8'),
631 'title': video_title,
632 'ext': video_extension.decode('utf-8'),
# Extractor for dailymotion.com. Disables the family filter via cookie,
# picks the best available quality from flashvars, and scrapes title,
# uploader and upload date from the page markup.
# NOTE(review): elided listing — `try:` headers, `return` statements and
# several branch lines are missing; leading integers are original line
# numbers. Confirm against the complete source before editing.
636 class DailymotionIE(InfoExtractor):
637 """Information Extractor for Dailymotion"""
639 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
640 IE_NAME = u'dailymotion'
642 def __init__(self, downloader=None):
643 InfoExtractor.__init__(self, downloader)
645 def report_download_webpage(self, video_id):
646 """Report webpage download."""
647 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
649 def report_extraction(self, video_id):
650 """Report information extraction."""
651 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
653 def _real_extract(self, url):
654 # Extract id and simplified title from URL
655 mobj = re.match(self._VALID_URL, url)
657 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Strip the slug ("_title") and query string from the captured ID.
660 video_id = mobj.group(1).split('_')[0].split('?')[0]
662 video_extension = 'mp4'
664 # Retrieve video webpage to extract further information
665 request = compat_urllib_request.Request(url)
# Cookie disables the family filter so restricted videos are served.
666 request.add_header('Cookie', 'family_filter=off')
668 self.report_download_webpage(video_id)
669 webpage_bytes = compat_urllib_request.urlopen(request).read()
670 webpage = webpage_bytes.decode('utf-8')
671 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
672 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
675 # Extract URL, uploader and title from webpage
676 self.report_extraction(video_id)
677 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
679 self._downloader.trouble(u'ERROR: unable to extract media URL')
681 flashvars = compat_urllib_parse.unquote(mobj.group(1))
# Quality preference, best first; the loop body that sets max_quality
# on the first hit is elided from this listing.
683 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
686 self._downloader.to_screen(u'[dailymotion] Using %s' % key)
689 self._downloader.trouble(u'ERROR: unable to extract video URL')
692 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
694 self._downloader.trouble(u'ERROR: unable to extract video URL')
697 video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
699 # TODO: support choosing qualities
701 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
703 self._downloader.trouble(u'ERROR: unable to extract title')
705 video_title = unescapeHTML(mobj.group('title'))
# Uploader: try the regular owner span first, then the rel="author"
# span used on official-user pages.
707 video_uploader = None
708 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
710 # lookin for official user
711 mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
712 if mobj_official is None:
713 self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
715 video_uploader = mobj_official.group(1)
717 video_uploader = mobj.group(1)
# Upload date: page shows DD-MM-YYYY; reassemble as YYYYMMDD.
719 video_upload_date = None
720 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
722 video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
# Result dict (opening delimiter and 'id'/'url' entries elided).
727 'uploader': video_uploader,
728 'upload_date': video_upload_date,
729 'title': video_title,
730 'ext': video_extension,
# Extractor for photobucket.com FLV videos: the media URL comes from the
# rel="video_src" link tag, uploader and title from the page <title>.
# NOTE(review): elided listing — `try:` headers, `return` statements and
# some assignments (e.g. video_url from mediaURL) are missing; leading
# integers are original line numbers.
734 class PhotobucketIE(InfoExtractor):
735 """Information extractor for photobucket.com."""
737 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
738 IE_NAME = u'photobucket'
740 def __init__(self, downloader=None):
741 InfoExtractor.__init__(self, downloader)
743 def report_download_webpage(self, video_id):
744 """Report webpage download."""
745 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
747 def report_extraction(self, video_id):
748 """Report information extraction."""
749 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
751 def _real_extract(self, url):
752 # Extract id from URL
753 mobj = re.match(self._VALID_URL, url)
755 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
758 video_id = mobj.group(1)
760 video_extension = 'flv'
762 # Retrieve video webpage to extract further information
763 request = compat_urllib_request.Request(url)
765 self.report_download_webpage(video_id)
766 webpage = compat_urllib_request.urlopen(request).read()
767 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
768 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
771 # Extract URL, uploader, and title from webpage
772 self.report_extraction(video_id)
773 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
775 self._downloader.trouble(u'ERROR: unable to extract media URL')
777 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
# NOTE(review): the line assigning video_url from mediaURL (~line 779)
# is elided from this listing.
781 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
783 self._downloader.trouble(u'ERROR: unable to extract title')
785 video_title = mobj.group(1).decode('utf-8')
787 video_uploader = mobj.group(2).decode('utf-8')
# Result dict (opening/closing delimiters elided from this listing).
790 'id': video_id.decode('utf-8'),
791 'url': video_url.decode('utf-8'),
792 'uploader': video_uploader,
794 'title': video_title,
795 'ext': video_extension.decode('utf-8'),
# Extractor for video.yahoo.com. Non-/watch/ URLs are first rewritten to the
# canonical /watch/<vid>/<id> form and re-extracted recursively; the real
# media URL then comes from a playlist XML fetched with scraped dimensions.
# NOTE(review): elided listing — `try:` headers, `return` statements and
# dict delimiters are missing; leading integers are original line numbers.
799 class YahooIE(InfoExtractor):
800 """Information extractor for video.yahoo.com."""
803 # _VALID_URL matches all Yahoo! Video URLs
804 # _VPAGE_URL matches only the extractable '/watch/' URLs
805 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
806 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
807 IE_NAME = u'video.yahoo'
809 def __init__(self, downloader=None):
810 InfoExtractor.__init__(self, downloader)
812 def report_download_webpage(self, video_id):
813 """Report webpage download."""
814 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
816 def report_extraction(self, video_id):
817 """Report information extraction."""
818 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
# new_video=False marks the recursive second pass on the rewritten
# /watch/ URL, preventing infinite rewrite loops.
820 def _real_extract(self, url, new_video=True):
821 # Extract ID from URL
822 mobj = re.match(self._VALID_URL, url)
824 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
827 video_id = mobj.group(2)
828 video_extension = 'flv'
830 # Rewrite valid but non-extractable URLs as
831 # extractable English language /watch/ URLs
832 if re.match(self._VPAGE_URL, url) is None:
833 request = compat_urllib_request.Request(url)
835 webpage = compat_urllib_request.urlopen(request).read()
836 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
837 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
840 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
842 self._downloader.trouble(u'ERROR: Unable to extract id field')
844 yahoo_id = mobj.group(1)
846 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
848 self._downloader.trouble(u'ERROR: Unable to extract vid field')
850 yahoo_vid = mobj.group(1)
# Recurse once on the canonical URL.
852 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
853 return self._real_extract(url, new_video=False)
855 # Retrieve video webpage to extract further information
856 request = compat_urllib_request.Request(url)
858 self.report_download_webpage(video_id)
859 webpage = compat_urllib_request.urlopen(request).read()
860 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
861 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
864 # Extract uploader and title from webpage
865 self.report_extraction(video_id)
866 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
868 self._downloader.trouble(u'ERROR: unable to extract video title')
870 video_title = mobj.group(1).decode('utf-8')
872 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
874 self._downloader.trouble(u'ERROR: unable to extract video uploader')
# NOTE(review): group(1) here is the "(people|profile)" alternation,
# not the uploader name in group(2) — looks like an off-by-one group
# index; confirm against the full source.
876 video_uploader = mobj.group(1).decode('utf-8')
878 # Extract video thumbnail
879 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
881 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
883 video_thumbnail = mobj.group(1).decode('utf-8')
885 # Extract video description
886 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
888 self._downloader.trouble(u'ERROR: unable to extract video description')
890 video_description = mobj.group(1).decode('utf-8')
891 if not video_description:
892 video_description = 'No description available.'
894 # Extract video height and width
895 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
897 self._downloader.trouble(u'ERROR: unable to extract video height')
899 yv_video_height = mobj.group(1)
901 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
903 self._downloader.trouble(u'ERROR: unable to extract video width')
905 yv_video_width = mobj.group(1)
907 # Retrieve video playlist to extract media URL
908 # I'm not completely sure what all these options are, but we
909 # seem to need most of them, otherwise the server sends a 401.
910 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
911 yv_bitrate = '700' # according to Wikipedia this is hard-coded
912 request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
913 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
914 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
916 self.report_download_webpage(video_id)
917 webpage = compat_urllib_request.urlopen(request).read()
918 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
919 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
922 # Extract media URL from playlist XML
923 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
925 self._downloader.trouble(u'ERROR: Unable to extract media URL')
927 video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
928 video_url = unescapeHTML(video_url)
# Result dict (opening/closing delimiters and 'url' entry elided).
931 'id': video_id.decode('utf-8'),
933 'uploader': video_uploader,
935 'title': video_title,
936 'ext': video_extension.decode('utf-8'),
937 'thumbnail': video_thumbnail.decode('utf-8'),
938 'description': video_description,
942 class VimeoIE(InfoExtractor):
943 """Information extractor for vimeo.com."""
# group(1) of _VALID_URL captures the numeric video id; matches plain,
# player., groups/ and album/ URL forms.
945 # _VALID_URL matches Vimeo URLs
946 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
949 def __init__(self, downloader=None):
950 InfoExtractor.__init__(self, downloader)
# Progress reporting helpers: route status lines through the downloader's screen output.
952 def report_download_webpage(self, video_id):
953 """Report webpage download."""
954 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
956 def report_extraction(self, video_id):
957 """Report information extraction."""
958 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
960 def _real_extract(self, url, new_video=True):
961 # Extract ID from URL
962 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the "if mobj is None:" guard appears elided in this listing — confirm against the full file.
964 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
967 video_id = mobj.group(1)
969 # Retrieve video webpage to extract further information
970 request = compat_urllib_request.Request(url, None, std_headers)
# NOTE(review): the enclosing "try:" line appears elided in this listing.
972 self.report_download_webpage(video_id)
973 webpage_bytes = compat_urllib_request.urlopen(request).read()
974 webpage = webpage_bytes.decode('utf-8')
975 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
976 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
979 # Now we begin extracting as much information as we can from what we
980 # retrieved. First we extract the information common to all extractors,
981 # and latter we extract those that are Vimeo specific.
982 self.report_extraction(video_id)
984 # Extract the config JSON
# The page embeds a JS object; slicing between ' = {config:' and ',assets:'
# isolates the JSON-parseable config fragment.
986 config = webpage.split(' = {config:')[1].split(',assets:')[0]
987 config = json.loads(config)
989 self._downloader.trouble(u'ERROR: unable to extract info section')
993 video_title = config["video"]["title"]
996 video_uploader = config["video"]["owner"]["name"]
998 # Extract video thumbnail
999 video_thumbnail = config["video"]["thumbnail"]
1001 # Extract video description
1002 video_description = get_element_by_id("description", webpage)
1003 if video_description: video_description = clean_html(video_description)
1004 else: video_description = ''
1006 # Extract upload date
1007 video_upload_date = None
1008 mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
1009 if mobj is not None:
1010 video_upload_date = mobj.group(1)
1012 # Vimeo specific: extract request signature and timestamp
1013 sig = config['request']['signature']
1014 timestamp = config['request']['timestamp']
1016 # Vimeo specific: extract video codec and quality information
1017 # First consider quality, then codecs, then take everything
1018 # TODO bind to format param
1019 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1020 files = { 'hd': [], 'sd': [], 'other': []}
# Bucket each available codec by best available quality tier.
1021 for codec_name, codec_extension in codecs:
1022 if codec_name in config["video"]["files"]:
1023 if 'hd' in config["video"]["files"][codec_name]:
1024 files['hd'].append((codec_name, codec_extension, 'hd'))
1025 elif 'sd' in config["video"]["files"][codec_name]:
1026 files['sd'].append((codec_name, codec_extension, 'sd'))
1028 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
# Pick the first (codec, ext, quality) triple from the best non-empty tier.
1030 for quality in ('hd', 'sd', 'other'):
1031 if len(files[quality]) > 0:
1032 video_quality = files[quality][0][2]
1033 video_codec = files[quality][0][0]
1034 video_extension = files[quality][0][1]
1035 self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1038 self._downloader.trouble(u'ERROR: no known codec found')
1041 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1042 %(video_id, sig, timestamp, video_quality, video_codec.upper())
# NOTE(review): the "return [{" opener for the info dict appears elided in this listing.
1047 'uploader': video_uploader,
1048 'upload_date': video_upload_date,
1049 'title': video_title,
1050 'ext': video_extension,
1051 'thumbnail': video_thumbnail,
1052 'description': video_description,
1056 class ArteTvIE(InfoExtractor):
1057 """arte.tv information extractor."""
# French/German video pages on videos.arte.tv; _LIVE_URL distinguishes
# live-stream index pages from regular ("Plus 7") video pages.
1059 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1060 _LIVE_URL = r'index-[0-9]+\.html$'
1062 IE_NAME = u'arte.tv'
1064 def __init__(self, downloader=None):
1065 InfoExtractor.__init__(self, downloader)
1067 def report_download_webpage(self, video_id):
1068 """Report webpage download."""
1069 self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)
1071 def report_extraction(self, video_id):
1072 """Report information extraction."""
1073 self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)
# Download a page's raw bytes, reporting progress and funnelling network
# errors through the downloader's trouble() handler.
1075 def fetch_webpage(self, url):
1076 self._downloader.increment_downloads()
1077 request = compat_urllib_request.Request(url)
# NOTE(review): the enclosing "try:" line appears elided in this listing.
1079 self.report_download_webpage(url)
1080 webpage = compat_urllib_request.urlopen(request).read()
1081 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1082 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1084 except ValueError as err:
1085 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
# Fetch `url`, run `regex` over it, and return a dict mapping each
# (group-index, key, error-message) tuple in matchTuples to its captured text.
1089 def grep_webpage(self, url, regex, regexFlags, matchTuples):
1090 page = self.fetch_webpage(url)
1091 mobj = re.search(regex, page, regexFlags)
# NOTE(review): the "if mobj is None:" guard appears elided in this listing.
1095 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1098 for (i, key, err) in matchTuples:
1099 if mobj.group(i) is None:
1100 self._downloader.trouble(err)
1103 info[key] = mobj.group(i)
# Live streams: follow the videothek JS to locate the SWF player and RTMP path.
1107 def extractLiveStream(self, url):
1108 video_lang = url.split('/')[-4]
1109 info = self.grep_webpage(
1111 r'src="(.*?/videothek_js.*?\.js)',
1114 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1117 http_host = url.split('/')[2]
1118 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1119 info = self.grep_webpage(
1121 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1122 '(http://.*?\.swf).*?' +
1126 (1, 'path', u'ERROR: could not extract video path: %s' % url),
1127 (2, 'player', u'ERROR: could not extract video player: %s' % url),
1128 (3, 'url', u'ERROR: could not extract video url: %s' % url)
1131 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
# "Plus 7" catch-up videos: hop through two indirection URLs to the final
# XML document that carries id/title/date and the HD stream URL.
1133 def extractPlus7Stream(self, url):
1134 video_lang = url.split('/')[-3]
1135 info = self.grep_webpage(
1137 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1140 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1143 next_url = compat_urllib_parse.unquote(info.get('url'))
1144 info = self.grep_webpage(
1146 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1149 (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
1152 next_url = compat_urllib_parse.unquote(info.get('url'))
1154 info = self.grep_webpage(
1156 r'<video id="(.*?)".*?>.*?' +
1157 '<name>(.*?)</name>.*?' +
1158 '<dateVideo>(.*?)</dateVideo>.*?' +
1159 '<url quality="hd">(.*?)</url>',
1162 (1, 'id', u'ERROR: could not extract video id: %s' % url),
1163 (2, 'title', u'ERROR: could not extract video title: %s' % url),
1164 (3, 'date', u'ERROR: could not extract video date: %s' % url),
1165 (4, 'url', u'ERROR: could not extract video url: %s' % url)
# NOTE(review): the "return {" / "return [{" opener appears elided in this listing.
1170 'id': info.get('id'),
1171 'url': compat_urllib_parse.unquote(info.get('url')),
1172 'uploader': u'arte.tv',
1173 'upload_date': info.get('date'),
1174 'title': info.get('title').decode('utf-8'),
# Dispatch: live index pages vs. regular Plus 7 video pages.
1180 def _real_extract(self, url):
1181 video_id = url.split('/')[-1]
1182 self.report_extraction(video_id)
1184 if re.search(self._LIVE_URL, video_id) is not None:
1185 self.extractLiveStream(url)
1188 info = self.extractPlus7Stream(url)
1193 class GenericIE(InfoExtractor):
1194 """Generic last-resort information extractor."""
1197 IE_NAME = u'generic'
1199 def __init__(self, downloader=None):
1200 InfoExtractor.__init__(self, downloader)
1202 def report_download_webpage(self, video_id):
1203 """Report webpage download."""
# Warn loudly: the generic extractor is a fallback and frequently wrong.
1204 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1205 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1207 def report_extraction(self, video_id):
1208 """Report information extraction."""
1209 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1211 def report_following_redirect(self, new_url):
1212 """Report information extraction."""
1213 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1215 def _test_redirect(self, url):
1216 """Check if it is a redirect, like url shorteners, in case restart chain."""
# Issue a HEAD request so we can resolve shorteners without downloading bodies.
1217 class HeadRequest(compat_urllib_request.Request):
1218 def get_method(self):
# NOTE(review): the `return "HEAD"` body appears elided in this listing.
1221 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1223 Subclass the HTTPRedirectHandler to make it use our
1224 HeadRequest also on the redirected URL
1226 def redirect_request(self, req, fp, code, msg, headers, newurl):
1227 if code in (301, 302, 303, 307):
1228 newurl = newurl.replace(' ', '%20')
# Drop body-describing headers — a HEAD request carries no body.
1229 newheaders = dict((k,v) for k,v in req.headers.items()
1230 if k.lower() not in ("content-length", "content-type"))
1231 return HeadRequest(newurl,
1233 origin_req_host=req.get_origin_req_host(),
1236 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1238 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1240 Fallback to GET if HEAD is not allowed (405 HTTP error)
1242 def http_error_405(self, req, fp, code, msg, headers):
1246 newheaders = dict((k,v) for k,v in req.headers.items()
1247 if k.lower() not in ("content-length", "content-type"))
# Re-open the same URL as a plain (GET) request through the parent opener.
1248 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1250 origin_req_host=req.get_origin_req_host(),
# Build a bare opener with exactly the handlers we need; handler order
# matters for how redirects and errors are processed.
1254 opener = compat_urllib_request.OpenerDirector()
1255 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1256 HTTPMethodFallback, HEADRedirectHandler,
1257 compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1258 opener.add_handler(handler())
1260 response = opener.open(HeadRequest(url))
1261 new_url = response.geturl()
# If the final URL differs, restart the extraction chain on it.
1266 self.report_following_redirect(new_url)
1267 self._downloader.download([new_url])
1270 def _real_extract(self, url):
1271 if self._test_redirect(url): return
1273 video_id = url.split('/')[-1]
1274 request = compat_urllib_request.Request(url)
# NOTE(review): the enclosing "try:" line appears elided in this listing.
1276 self.report_download_webpage(video_id)
1277 webpage = compat_urllib_request.urlopen(request).read()
1278 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1279 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1281 except ValueError as err:
1282 # since this is the last-resort InfoExtractor, if
1283 # this error is thrown, it'll be thrown here
1284 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1287 self.report_extraction(video_id)
1288 # Start with something easy: JW Player in SWFObject
1289 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
# NOTE(review): the "if mobj is None:" guard appears elided in this listing.
1291 # Broaden the search a little bit
1292 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1294 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1297 # It's possible that one of the regexes
1298 # matched, but returned an empty group:
1299 if mobj.group(1) is None:
1300 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1303 video_url = compat_urllib_parse.unquote(mobj.group(1))
1304 video_id = os.path.basename(video_url)
1306 # here's a fun little line of code for you:
1307 video_extension = os.path.splitext(video_id)[1][1:]
1308 video_id = os.path.splitext(video_id)[0]
1310 # it's tempting to parse this further, but you would
1311 # have to take into account all the variations like
1312 # Video Title - Site Name
1313 # Site Name | Video Title
1314 # Video Title - Tagline | Site Name
1315 # and so on and so forth; it's just not practical
1316 mobj = re.search(r'<title>(.*)</title>', webpage)
1318 self._downloader.trouble(u'ERROR: unable to extract title')
1320 video_title = mobj.group(1)
1322 # video uploader is domain name
1323 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1325 self._downloader.trouble(u'ERROR: unable to extract title')
1327 video_uploader = mobj.group(1)
# NOTE(review): the returned info-dict opener appears elided in this listing.
1332 'uploader': video_uploader,
1333 'upload_date': None,
1334 'title': video_title,
1335 'ext': video_extension,
1339 class YoutubeSearchIE(InfoExtractor):
1340 """Information Extractor for YouTube search queries."""
# Accepts "ytsearch:QUERY", "ytsearchN:QUERY" or "ytsearchall:QUERY".
1341 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1342 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
# Hard cap imposed by the GData search API.
1343 _max_youtube_results = 1000
1344 IE_NAME = u'youtube:search'
1346 def __init__(self, downloader=None):
1347 InfoExtractor.__init__(self, downloader)
1349 def report_download_page(self, query, pagenum):
1350 """Report attempt to download search page with given number."""
1351 query = query.decode(preferredencoding())
1352 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
# Parse the "ytsearch[N|all]:query" prefix and dispatch to _download_n_results.
1354 def _real_extract(self, query):
1355 mobj = re.match(self._VALID_URL, query)
# NOTE(review): the "if mobj is None:" guard appears elided in this listing.
1357 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1360 prefix, query = query.split(':')
1362 query = query.encode('utf-8')
# NOTE(review): the branch conditions (empty prefix / numeric prefix) appear
# elided in this listing; the visible calls cover 1, all, and N results.
1364 self._download_n_results(query, 1)
1366 elif prefix == 'all':
1367 self._download_n_results(query, self._max_youtube_results)
1373 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1375 elif n > self._max_youtube_results:
1376 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1377 n = self._max_youtube_results
1378 self._download_n_results(query, n)
1380 except ValueError: # parsing prefix as integer fails
1381 self._download_n_results(query, 1)
1384 def _download_n_results(self, query, n):
1385 """Downloads a specified number of results for a query"""
# Page through the GData API 50 results at a time until `limit` is reached.
1391 while (50 * pagenum) < limit:
1392 self.report_download_page(query, pagenum+1)
1393 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1394 request = compat_urllib_request.Request(result_url)
# NOTE(review): the enclosing "try:" line appears elided in this listing.
1396 data = compat_urllib_request.urlopen(request).read()
1397 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1398 self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
1400 api_response = json.loads(data)['data']
1402 new_ids = list(video['id'] for video in api_response['items'])
1403 video_ids += new_ids
# Never request more than the API reports as available.
1405 limit = min(n, api_response['totalItems'])
1408 if len(video_ids) > n:
1409 video_ids = video_ids[:n]
1410 for id in video_ids:
1411 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1415 class GoogleSearchIE(InfoExtractor):
1416 """Information Extractor for Google Video search queries."""
# Accepts "gvsearch:QUERY", "gvsearchN:QUERY" or "gvsearchall:QUERY".
1417 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1418 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1419 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1420 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1421 _max_google_results = 1000
1422 IE_NAME = u'video.google:search'
1424 def __init__(self, downloader=None):
1425 InfoExtractor.__init__(self, downloader)
1427 def report_download_page(self, query, pagenum):
1428 """Report attempt to download playlist page with given number."""
1429 query = query.decode(preferredencoding())
1430 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
# Parse the "gvsearch[N|all]:query" prefix and dispatch to _download_n_results.
1432 def _real_extract(self, query):
1433 mobj = re.match(self._VALID_URL, query)
# NOTE(review): the "if mobj is None:" guard appears elided in this listing.
1435 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1438 prefix, query = query.split(':')
1440 query = query.encode('utf-8')
# NOTE(review): the branch conditions (empty prefix / numeric prefix) appear
# elided in this listing; the visible calls cover 1, all, and N results.
1442 self._download_n_results(query, 1)
1444 elif prefix == 'all':
1445 self._download_n_results(query, self._max_google_results)
1451 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1453 elif n > self._max_google_results:
1454 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1455 n = self._max_google_results
1456 self._download_n_results(query, n)
1458 except ValueError: # parsing prefix as integer fails
1459 self._download_n_results(query, 1)
1462 def _download_n_results(self, query, n):
1463 """Downloads a specified number of results for a query"""
# Scrape HTML result pages (10 results per page) until n ids are collected
# or the "next page" marker disappears.
1469 self.report_download_page(query, pagenum)
1470 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
1471 request = compat_urllib_request.Request(result_url)
# NOTE(review): the enclosing "try:" line appears elided in this listing.
1473 page = compat_urllib_request.urlopen(request).read()
1474 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1475 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1478 # Extract video identifiers
1479 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1480 video_id = mobj.group(1)
1481 if video_id not in video_ids:
1482 video_ids.append(video_id)
1483 if len(video_ids) == n:
1484 # Specified n videos reached
1485 for id in video_ids:
1486 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
# Last page reached: download whatever was collected.
1489 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1490 for id in video_ids:
1491 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1494 pagenum = pagenum + 1
1497 class YahooSearchIE(InfoExtractor):
1498 """Information Extractor for Yahoo! Video search queries."""
# Accepts "yvsearch:QUERY", "yvsearchN:QUERY" or "yvsearchall:QUERY".
1501 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1502 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1503 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1504 _MORE_PAGES_INDICATOR = r'\s*Next'
1505 _max_yahoo_results = 1000
1506 IE_NAME = u'video.yahoo:search'
1508 def __init__(self, downloader=None):
1509 InfoExtractor.__init__(self, downloader)
1511 def report_download_page(self, query, pagenum):
1512 """Report attempt to download playlist page with given number."""
1513 query = query.decode(preferredencoding())
1514 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
# Parse the "yvsearch[N|all]:query" prefix and dispatch to _download_n_results.
1516 def _real_extract(self, query):
1517 mobj = re.match(self._VALID_URL, query)
# NOTE(review): the "if mobj is None:" guard appears elided in this listing.
1519 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1522 prefix, query = query.split(':')
1524 query = query.encode('utf-8')
# NOTE(review): the branch conditions (empty prefix / numeric prefix) appear
# elided in this listing; the visible calls cover 1, all, and N results.
1526 self._download_n_results(query, 1)
1528 elif prefix == 'all':
1529 self._download_n_results(query, self._max_yahoo_results)
1535 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1537 elif n > self._max_yahoo_results:
1538 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1539 n = self._max_yahoo_results
1540 self._download_n_results(query, n)
1542 except ValueError: # parsing prefix as integer fails
1543 self._download_n_results(query, 1)
1546 def _download_n_results(self, query, n):
1547 """Downloads a specified number of results for a query"""
# `already_seen` deduplicates ids across result pages.
1550 already_seen = set()
1554 self.report_download_page(query, pagenum)
1555 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
1556 request = compat_urllib_request.Request(result_url)
# NOTE(review): the enclosing "try:" line appears elided in this listing.
1558 page = compat_urllib_request.urlopen(request).read()
1559 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1560 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1563 # Extract video identifiers
1564 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1565 video_id = mobj.group(1)
1566 if video_id not in already_seen:
1567 video_ids.append(video_id)
1568 already_seen.add(video_id)
1569 if len(video_ids) == n:
1570 # Specified n videos reached
1571 for id in video_ids:
1572 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
# Last page reached: download whatever was collected.
1575 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1576 for id in video_ids:
1577 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1580 pagenum = pagenum + 1
1583 class YoutubePlaylistIE(InfoExtractor):
1584 """Information Extractor for YouTube playlists."""
# group(1) is the playlist-type query key (p/a/list), group(2) the playlist id,
# group(3) an optional trailing video id that short-circuits to a single download.
1586 _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
1587 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1588 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s'
# Pagination marker: the "Next »" link text on playlist pages.
1589 _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
1590 IE_NAME = u'youtube:playlist'
1592 def __init__(self, downloader=None):
1593 InfoExtractor.__init__(self, downloader)
1595 def report_download_page(self, playlist_id, pagenum):
1596 """Report attempt to download playlist page with given number."""
1597 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1599 def _real_extract(self, url):
1600 # Extract playlist id
1601 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the "if mobj is None:" guard appears elided in this listing.
1603 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# Single video in a playlist URL: hand it back to the downloader directly.
1607 if mobj.group(3) is not None:
1608 self._downloader.download([mobj.group(3)])
1611 # Download playlist pages
1612 # prefix is 'p' as default for playlists but there are other types that need extra care
1613 playlist_prefix = mobj.group(1)
1614 if playlist_prefix == 'a':
1615 playlist_access = 'artist'
1617 playlist_prefix = 'p'
1618 playlist_access = 'view_play_list'
1619 playlist_id = mobj.group(2)
1624 self.report_download_page(playlist_id, pagenum)
1625 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1626 request = compat_urllib_request.Request(url)
# NOTE(review): the enclosing "try:" line appears elided in this listing.
1628 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1629 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1630 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1633 # Extract video identifiers
1635 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1636 if mobj.group(1) not in ids_in_page:
1637 ids_in_page.append(mobj.group(1))
1638 video_ids.extend(ids_in_page)
1640 if self._MORE_PAGES_INDICATOR not in page:
1642 pagenum = pagenum + 1
# Apply --playliststart/--playlistend slicing (1-based start, -1 = no end).
1644 total = len(video_ids)
1646 playliststart = self._downloader.params.get('playliststart', 1) - 1
1647 playlistend = self._downloader.params.get('playlistend', -1)
1648 if playlistend == -1:
1649 video_ids = video_ids[playliststart:]
1651 video_ids = video_ids[playliststart:playlistend]
1653 if len(video_ids) == total:
1654 self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
1656 self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))
1658 for id in video_ids:
1659 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1663 class YoutubeChannelIE(InfoExtractor):
1664 """Information Extractor for YouTube channels."""
# group(1) captures the channel id from /channel/ URLs.
1666 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1667 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
# Pagination marker: the "Next »" link text on channel pages.
1668 _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
1669 IE_NAME = u'youtube:channel'
1671 def report_download_page(self, channel_id, pagenum):
1672 """Report attempt to download channel page with given number."""
1673 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1675 def _real_extract(self, url):
1676 # Extract channel id
1677 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the "if mobj is None:" guard appears elided in this listing.
1679 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1682 # Download channel pages
1683 channel_id = mobj.group(1)
# Page through the channel's video-list pages until the "Next »" marker disappears.
1688 self.report_download_page(channel_id, pagenum)
1689 url = self._TEMPLATE_URL % (channel_id, pagenum)
1690 request = compat_urllib_request.Request(url)
# NOTE(review): the enclosing "try:" line appears elided in this listing.
1692 page = compat_urllib_request.urlopen(request).read().decode('utf8')
1693 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1694 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1697 # Extract video identifiers
1699 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1700 if mobj.group(1) not in ids_in_page:
1701 ids_in_page.append(mobj.group(1))
1702 video_ids.extend(ids_in_page)
1704 if self._MORE_PAGES_INDICATOR not in page:
1706 pagenum = pagenum + 1
1708 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1710 for id in video_ids:
1711 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1715 class YoutubeUserIE(InfoExtractor):
1716 """Information Extractor for YouTube users."""
# Accepts user-page URLs or the "ytuser:NAME" shorthand; group(1) is the username.
1718 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1719 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
# GData caps uploads queries at 50 results per request.
1720 _GDATA_PAGE_SIZE = 50
1721 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1722 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1723 IE_NAME = u'youtube:user'
1725 def __init__(self, downloader=None):
1726 InfoExtractor.__init__(self, downloader)
1728 def report_download_page(self, username, start_index):
1729 """Report attempt to download user page."""
1730 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1731 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1733 def _real_extract(self, url):
1735 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the "if mobj is None:" guard appears elided in this listing.
1737 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1740 username = mobj.group(1)
1742 # Download video ids using YouTube Data API. Result size per
1743 # query is limited (currently to 50 videos) so we need to query
1744 # page by page until there are no video ids - it means we got
# GData start-index is 1-based.
1751 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1752 self.report_download_page(username, start_index)
1754 request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
# NOTE(review): the enclosing "try:" line appears elided in this listing.
1757 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1758 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1759 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1762 # Extract video identifiers
1765 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1766 if mobj.group(1) not in ids_in_page:
1767 ids_in_page.append(mobj.group(1))
1769 video_ids.extend(ids_in_page)
1771 # A little optimization - if current page is not
1772 # "full", ie. does not contain PAGE_SIZE video ids then
1773 # we can assume that this page is the last one - there
1774 # are no more ids on further pages - no need to query
1777 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
# Apply --playliststart/--playlistend slicing (1-based start, -1 = no end).
1782 all_ids_count = len(video_ids)
1783 playliststart = self._downloader.params.get('playliststart', 1) - 1
1784 playlistend = self._downloader.params.get('playlistend', -1)
1786 if playlistend == -1:
1787 video_ids = video_ids[playliststart:]
1789 video_ids = video_ids[playliststart:playlistend]
1791 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1792 (username, all_ids_count, len(video_ids)))
1794 for video_id in video_ids:
1795 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1798 class BlipTVUserIE(InfoExtractor):
1799 """Information Extractor for blip.tv users."""
# Accepts blip.tv user-page URLs or the "bliptvuser:NAME" shorthand.
1801 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1803 IE_NAME = u'blip.tv:user'
1805 def __init__(self, downloader=None):
1806 InfoExtractor.__init__(self, downloader)
1808 def report_download_page(self, username, pagenum):
1809 """Report attempt to download user page."""
1810 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1811 (self.IE_NAME, username, pagenum))
1813 def _real_extract(self, url):
1815 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the "if mobj is None:" guard appears elided in this listing.
1817 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1820 username = mobj.group(1)
# The mobile site exposes the numeric users_id needed by the episode-list API.
1822 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1824 request = compat_urllib_request.Request(url)
# NOTE(review): the enclosing "try:" line appears elided in this listing.
1827 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1828 mobj = re.search(r'data-users-id="([^"]+)"', page)
1829 page_base = page_base % mobj.group(1)
1830 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1831 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1835 # Download video ids using BlipTV Ajax calls. Result size per
1836 # query is limited (currently to 12 videos) so we need to query
1837 # page by page until there are no video ids - it means we got
1844 self.report_download_page(username, pagenum)
1846 request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )
1849 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1850 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
# NOTE(review): uses str(err) here while sibling extractors use compat_str(err) — confirm intended.
1851 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1854 # Extract video identifiers
1857 for mobj in re.finditer(r'href="/([^"]+)"', page):
1858 if mobj.group(1) not in ids_in_page:
1859 ids_in_page.append(unescapeHTML(mobj.group(1)))
1861 video_ids.extend(ids_in_page)
1863 # A little optimization - if current page is not
1864 # "full", ie. does not contain PAGE_SIZE video ids then
1865 # we can assume that this page is the last one - there
1866 # are no more ids on further pages - no need to query
# NOTE(review): _PAGE_SIZE is not defined in the visible class body — verify it exists elsewhere.
1869 if len(ids_in_page) < self._PAGE_SIZE:
# Apply --playliststart/--playlistend slicing (1-based start, -1 = no end).
1874 all_ids_count = len(video_ids)
1875 playliststart = self._downloader.params.get('playliststart', 1) - 1
1876 playlistend = self._downloader.params.get('playlistend', -1)
1878 if playlistend == -1:
1879 video_ids = video_ids[playliststart:]
1881 video_ids = video_ids[playliststart:playlistend]
1883 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1884 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1886 for video_id in video_ids:
1887 self._downloader.download([u'http://blip.tv/'+video_id])
1890 class DepositFilesIE(InfoExtractor):
1891 """Information extractor for depositfiles.com"""
# The (?#locale) comment group documents that the two-segment prefix is a locale code.
1893 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1894 IE_NAME = u'DepositFiles'
1896 def __init__(self, downloader=None):
1897 InfoExtractor.__init__(self, downloader)
1899 def report_download_webpage(self, file_id):
1900 """Report webpage download."""
1901 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1903 def report_extraction(self, file_id):
1904 """Report information extraction."""
1905 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1907 def _real_extract(self, url):
1908 file_id = url.split('/')[-1]
1909 # Rebuild url in english locale
1910 url = 'http://depositfiles.com/en/files/' + file_id
1912 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 simulates clicking the free-download button.
1913 free_download_indication = { 'gateway_result' : '1' }
1914 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
# NOTE(review): the enclosing "try:" line appears elided in this listing.
1916 self.report_download_webpage(file_id)
1917 webpage = compat_urllib_request.urlopen(request).read()
1918 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1919 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
1922 # Search for the real file URL
1923 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1924 if (mobj is None) or (mobj.group(1) is None):
1925 # Try to figure out reason of the error.
1926 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1927 if (mobj is not None) and (mobj.group(1) is not None):
# Collapse whitespace in the site's restriction notice before reporting it.
1928 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1929 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1931 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1934 file_url = mobj.group(1)
1935 file_extension = os.path.splitext(file_url)[1][1:]
1937 # Search for file title
1938 mobj = re.search(r'<b title="(.*?)">', webpage)
# NOTE(review): the "if mobj is None:" guard appears elided in this listing.
1940 self._downloader.trouble(u'ERROR: unable to extract title')
1942 file_title = mobj.group(1).decode('utf-8')
# NOTE(review): the returned info-dict opener appears elided in this listing.
1945 'id': file_id.decode('utf-8'),
1946 'url': file_url.decode('utf-8'),
1948 'upload_date': None,
1949 'title': file_title,
1950 'ext': file_extension.decode('utf-8'),
1954 class FacebookIE(InfoExtractor):
1955 """Information Extractor for Facebook"""
# Matches facebook.com/video/video.php?...v=<ID> and /photo.php?...v=<ID>;
# the numeric video id is captured in the named group 'ID'.
1958 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1959 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1960 _NETRC_MACHINE = 'facebook'
# Listed best quality first; format selection in _real_extract relies on
# this ordering (index 0 == best).
1961 _available_formats = ['video', 'highqual', 'lowqual']
1962 _video_extensions = {
1967 IE_NAME = u'facebook'
1969 def __init__(self, downloader=None):
1970 InfoExtractor.__init__(self, downloader)
1972 def _reporter(self, message):
1973 """Add header and report message."""
1974 self._downloader.to_screen(u'[facebook] %s' % message)
1976 def report_login(self):
1977 """Report attempt to log in."""
1978 self._reporter(u'Logging in')
1980 def report_video_webpage_download(self, video_id):
1981 """Report attempt to download video webpage."""
1982 self._reporter(u'%s: Downloading video webpage' % video_id)
1984 def report_information_extraction(self, video_id):
1985 """Report attempt to extract video information."""
1986 self._reporter(u'%s: Extracting video information' % video_id)
1988 def _parse_page(self, video_webpage):
1989 """Extract video information from page"""
# Regexes for metadata embedded in the page's JavaScript / markup.
1991 data = {'title': r'\("video_title", "(.*?)"\)',
1992 'description': r'<div class="datawrap">(.*?)</div>',
1993 'owner': r'\("video_owner_name", "(.*?)"\)',
1994 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
1997 for piece in data.keys():
1998 mobj = re.search(data[piece], video_webpage)
1999 if mobj is not None:
2000 video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
# Collect per-format media URLs ("<fmt>_src" entries in the page JS).
2004 for fmt in self._available_formats:
2005 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2006 if mobj is not None:
2007 # URL is in a Javascript segment inside an escaped Unicode format within
2008 # the generally utf-8 page
2009 video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
2010 video_info['video_urls'] = video_urls
2014 def _real_initialize(self):
2015 if self._downloader is None:
2020 downloader_params = self._downloader.params
2022 # Attempt to use provided username and password or .netrc data
2023 if downloader_params.get('username', None) is not None:
2024 useremail = downloader_params['username']
2025 password = downloader_params['password']
2026 elif downloader_params.get('usenetrc', False):
2028 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2029 if info is not None:
2033 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2034 except (IOError, netrc.NetrcParseError) as err:
2035 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
2038 if useremail is None:
# Log in by POSTing the credentials form. Failure is only a warning:
# extraction may still work for publicly visible videos.
2047 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2050 login_results = compat_urllib_request.urlopen(request).read()
# If the response still contains the login form, authentication failed.
2051 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2052 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2054 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2055 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
2058 def _real_extract(self, url):
2059 mobj = re.match(self._VALID_URL, url)
2061 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2063 video_id = mobj.group('ID')
2066 self.report_video_webpage_download(video_id)
2067 request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2069 page = compat_urllib_request.urlopen(request)
2070 video_webpage = page.read()
2071 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2072 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2075 # Start extracting information
2076 self.report_information_extraction(video_id)
2078 # Extract information
2079 video_info = self._parse_page(video_webpage)
2082 if 'owner' not in video_info:
2083 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2085 video_uploader = video_info['owner']
2088 if 'title' not in video_info:
2089 self._downloader.trouble(u'ERROR: unable to extract video title')
2091 video_title = video_info['title']
2092 video_title = video_title.decode('utf-8')
2095 if 'thumbnail' not in video_info:
# Missing thumbnail is non-fatal: warn and fall back to an empty string.
2096 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2097 video_thumbnail = ''
2099 video_thumbnail = video_info['thumbnail']
2103 if 'upload_date' in video_info:
2104 upload_time = video_info['upload_date']
# Parse RFC 2822 style date and normalize to the YYYYMMDD form
# expected downstream (see class docs in this module).
2105 timetuple = email.utils.parsedate_tz(upload_time)
2106 if timetuple is not None:
2108 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2113 video_description = video_info.get('description', 'No description available.')
2115 url_map = video_info['video_urls']
2116 if len(url_map.keys()) > 0:
2117 # Decide which formats to download
2118 req_format = self._downloader.params.get('format', None)
2119 format_limit = self._downloader.params.get('format_limit', None)
# format_limit caps quality: keep only formats at or below the limit.
2121 if format_limit is not None and format_limit in self._available_formats:
2122 format_list = self._available_formats[self._available_formats.index(format_limit):]
2124 format_list = self._available_formats
2125 existing_formats = [x for x in format_list if x in url_map]
2126 if len(existing_formats) == 0:
2127 self._downloader.trouble(u'ERROR: no known formats available for video')
2129 if req_format is None:
2130 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2131 elif req_format == 'worst':
2132 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2133 elif req_format == '-1':
2134 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2137 if req_format not in url_map:
2138 self._downloader.trouble(u'ERROR: requested format not available')
2140 video_url_list = [(req_format, url_map[req_format])] # Specific format
# Emit one info dict per selected format.
2143 for format_param, video_real_url in video_url_list:
2145 video_extension = self._video_extensions.get(format_param, 'mp4')
2148 'id': video_id.decode('utf-8'),
2149 'url': video_real_url.decode('utf-8'),
2150 'uploader': video_uploader.decode('utf-8'),
2151 'upload_date': upload_date,
2152 'title': video_title,
2153 'ext': video_extension.decode('utf-8'),
2154 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2155 'thumbnail': video_thumbnail.decode('utf-8'),
2156 'description': video_description.decode('utf-8'),
2160 class BlipTVIE(InfoExtractor):
2161 """Information extractor for blip.tv"""
2163 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# Used below to pull the filename extension off the media URL.
2164 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2165 IE_NAME = u'blip.tv'
2167 def report_extraction(self, file_id):
2168 """Report information extraction."""
2169 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2171 def report_direct_download(self, title):
2172 """Report information extraction."""
2173 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2175 def _real_extract(self, url):
2176 mobj = re.match(self._VALID_URL, url)
2178 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Request JSON metadata by appending skin=json to the page URL.
2185 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2186 request = compat_urllib_request.Request(json_url)
2187 self.report_extraction(mobj.group(1))
2190 urlh = compat_urllib_request.urlopen(request)
# A video/* Content-Type means the server handed us the media itself,
# not JSON: derive title/ext from the URL basename instead.
2191 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2192 basename = url.split('/')[-1]
2193 title,ext = os.path.splitext(basename)
2194 title = title.decode('UTF-8')
2195 ext = ext.replace('.', '')
2196 self.report_direct_download(title)
2201 'upload_date': None,
2206 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2207 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2209 if info is None: # Regular URL
2211 json_code_bytes = urlh.read()
2212 json_code = json_code_bytes.decode('utf-8')
2213 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2214 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
2218 json_data = json.loads(json_code)
2219 if 'Post' in json_data:
2220 data = json_data['Post']
# datestamp arrives in '%m-%d-%y %H:%M%p' form; normalize to YYYYMMDD.
2224 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2225 video_url = data['media']['url']
2226 umobj = re.match(self._URL_EXT, video_url)
2228 raise ValueError('Can not determine filename extension')
2229 ext = umobj.group(1)
2232 'id': data['item_id'],
2234 'uploader': data['display_name'],
2235 'upload_date': upload_date,
2236 'title': data['title'],
2238 'format': data['media']['mimeType'],
2239 'thumbnail': data['thumbnailUrl'],
2240 'description': data['description'],
2241 'player_url': data['embedUrl']
2243 except (ValueError,KeyError) as err:
2244 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
# Spoof an iTunes User-Agent for the actual media download -- presumably
# needed to get a directly downloadable URL from the CDN; TODO confirm.
2247 std_headers['User-Agent'] = 'iTunes/10.6.1'
2251 class MyVideoIE(InfoExtractor):
2252 """Information Extractor for myvideo.de."""
2254 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2255 IE_NAME = u'myvideo'
2257 def __init__(self, downloader=None):
2258 InfoExtractor.__init__(self, downloader)
2260 def report_download_webpage(self, video_id):
2261 """Report webpage download."""
2262 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2264 def report_extraction(self, video_id):
2265 """Report information extraction."""
2266 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2268 def _real_extract(self,url):
2269 mobj = re.match(self._VALID_URL, url)
2271 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2274 video_id = mobj.group(1)
2277 request = compat_urllib_request.Request('http://www.myvideo.de/watch/%s' % video_id)
2279 self.report_download_webpage(video_id)
2280 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
2281 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2282 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
2285 self.report_extraction(video_id)
2286 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2289 self._downloader.trouble(u'ERROR: unable to extract media URL')
2291 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2293 mobj = re.search('<title>([^<]+)</title>', webpage)
2295 self._downloader.trouble(u'ERROR: unable to extract title')
2298 video_title = mobj.group(1)
2304 'upload_date': None,
2305 'title': video_title,
2309 class ComedyCentralIE(InfoExtractor):
2310 """Information extractor for The Daily Show and Colbert Report """
2312 # urls can be abbreviations like :thedailyshow or :colbert
2313 # urls for episodes like:
2314 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2315 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2316 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2317 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2318 |(https?://)?(www\.)?
2319 (?P<showname>thedailyshow|colbertnation)\.com/
2320 (full-episodes/(?P<episode>.*)|
2322 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2323 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
2325 IE_NAME = u'comedycentral'
# Bitrate labels the site offers; keys into _video_extensions/_video_dimensions.
2327 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2329 _video_extensions = {
2337 _video_dimensions = {
2346 def suitable(self, url):
2347 """Receives a URL and returns True if suitable for this IE."""
# _VALID_URL is written in verbose mode, so matching must pass re.VERBOSE.
2348 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
2350 def report_extraction(self, episode_id):
2351 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2353 def report_config_download(self, episode_id):
2354 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2356 def report_index_download(self, episode_id):
2357 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2359 def report_player_url(self, episode_id):
2360 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2363 def _print_formats(self, formats):
2364 print('Available formats:')
2366 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2369 def _real_extract(self, url):
2370 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2372 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Expand ':tds' / ':colbert' style shortcuts to the show's full-episodes
# page and re-match so the named groups get populated.
2375 if mobj.group('shortname'):
2376 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2377 url = u'http://www.thedailyshow.com/full-episodes/'
2379 url = u'http://www.colbertnation.com/full-episodes/'
2380 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2381 assert mobj is not None
2383 if mobj.group('clip'):
2384 if mobj.group('showname') == 'thedailyshow':
2385 epTitle = mobj.group('tdstitle')
2387 epTitle = mobj.group('cntitle')
2390 dlNewest = not mobj.group('episode')
2392 epTitle = mobj.group('showname')
2394 epTitle = mobj.group('episode')
2396 req = compat_urllib_request.Request(url)
2397 self.report_extraction(epTitle)
2399 htmlHandle = compat_urllib_request.urlopen(req)
2400 html = htmlHandle.read()
2401 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2402 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
# The site may redirect (e.g. to the newest episode); re-match the
# final URL that urlopen actually landed on.
2405 url = htmlHandle.geturl()
2406 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2408 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2410 if mobj.group('episode') == '':
2411 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2413 epTitle = mobj.group('episode')
# Find the mtvnservices media URL either in a <param name="movie"> tag
# or in a 'var url = "..."' JavaScript assignment.
2415 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', html)
2417 if len(mMovieParams) == 0:
2418 # The Colbert Report embeds the information in a without
2419 # a URL prefix; so extract the alternate reference
2420 # and then add the URL prefix manually.
2422 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', html)
2423 if len(altMovieParams) == 0:
2424 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2427 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2429 playerUrl_raw = mMovieParams[0][0]
2430 self.report_player_url(epTitle)
# Follow redirects to obtain the canonical player URL via geturl().
2432 urlHandle = compat_urllib_request.urlopen(playerUrl_raw)
2433 playerUrl = urlHandle.geturl()
2434 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2435 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + compat_str(err))
2438 uri = mMovieParams[0][1]
2439 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2440 self.report_index_download(epTitle)
2442 indexXml = compat_urllib_request.urlopen(indexUrl).read()
2443 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2444 self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
# Walk the MRSS feed: one <item> per segment of the episode.
2449 idoc = xml.etree.ElementTree.fromstring(indexXml)
2450 itemEls = idoc.findall('.//item')
2451 for itemEl in itemEls:
# guid looks like '...:<show>.com:<mediaId>'; split out its pieces.
2452 mediaId = itemEl.findall('./guid')[0].text
2453 shortMediaId = mediaId.split(':')[-1]
2454 showId = mediaId.split(':')[-2].replace('.com', '')
2455 officialTitle = itemEl.findall('./title')[0].text
2456 officialDate = itemEl.findall('./pubDate')[0].text
2458 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2459 compat_urllib_parse.urlencode({'uri': mediaId}))
2460 configReq = compat_urllib_request.Request(configUrl)
2461 self.report_config_download(epTitle)
2463 configXml = compat_urllib_request.urlopen(configReq).read()
2464 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2465 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
2468 cdoc = xml.etree.ElementTree.fromstring(configXml)
# Collect (bitrate, stream-url) pairs from the media config's renditions.
2470 for rendition in cdoc.findall('.//rendition'):
2471 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2475 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2478 if self._downloader.params.get('listformats', None):
2479 self._print_formats([i[0] for i in turls])
2482 # For now, just pick the highest bitrate
2483 format,video_url = turls[-1]
2485 # Get the format arg from the arg stream
2486 req_format = self._downloader.params.get('format', None)
2488 # Select format if we can find one
2491 format, video_url = f, v
2494 # Patch to download from alternative CDN, which does not
2495 # break on current RTMPDump builds
2496 broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
2497 better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"
2499 if video_url.startswith(broken_cdn):
2500 video_url = video_url.replace(broken_cdn, better_cdn)
2502 effTitle = showId + u'-' + epTitle
2507 'upload_date': officialDate,
2512 'description': officialTitle,
2513 'player_url': None #playerUrl
2516 results.append(info)
2521 class EscapistIE(InfoExtractor):
2522 """Information extractor for The Escapist """
2524 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2525 IE_NAME = u'escapist'
2527 def report_extraction(self, showName):
2528 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2530 def report_config_download(self, showName):
2531 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2533 def _real_extract(self, url):
2534 mobj = re.match(self._VALID_URL, url)
2536 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2538 showName = mobj.group('showname')
2539 videoId = mobj.group('episode')
2541 self.report_extraction(showName)
2543 webPage = compat_urllib_request.urlopen(url)
2544 webPageBytes = webPage.read()
# Honour the charset from the Content-Type header, defaulting to utf-8.
2545 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2546 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2547 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2548 self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
# Pull metadata from the page's <meta> tags; the og:video player URL
# carries a 'config=' query parameter pointing at the configuration.
2551 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2552 description = unescapeHTML(descMatch.group(1))
2553 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2554 imgUrl = unescapeHTML(imgMatch.group(1))
2555 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2556 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2557 configUrlMatch = re.search('config=(.*)$', playerUrl)
2558 configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2560 self.report_config_download(showName)
2562 configJSON = compat_urllib_request.urlopen(configUrl)
# Same charset-sniffing dance as for the main page.
2563 m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
2564 configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
2565 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2566 self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
2569 # Technically, it's JavaScript, not JSON
# Crude single-quote -> double-quote fixup so json.loads can parse it.
2570 configJSON = configJSON.replace("'", '"')
2573 config = json.loads(configJSON)
2574 except (ValueError,) as err:
2575 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
2578 playlist = config['playlist']
# The second playlist entry holds the actual media URL.
2579 videoUrl = playlist[1]['url']
2584 'uploader': showName,
2585 'upload_date': None,
2588 'thumbnail': imgUrl,
2589 'description': description,
2590 'player_url': playerUrl,
2596 class CollegeHumorIE(InfoExtractor):
2597 """Information extractor for collegehumor.com"""
2600 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2601 IE_NAME = u'collegehumor'
2603 def report_manifest(self, video_id):
2604 """Report information extraction."""
2605 self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))
2607 def report_extraction(self, video_id):
2608 """Report information extraction."""
2609 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2611 def _real_extract(self, url):
2612 mobj = re.match(self._VALID_URL, url)
2614 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2616 video_id = mobj.group('videoid')
2621 'upload_date': None,
# Step 1: fetch the per-video metadata XML (title, description,
# thumbnail, and the URL of the manifest).
2624 self.report_extraction(video_id)
2625 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2627 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2628 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2629 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2632 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2634 videoNode = mdoc.findall('./video')[0]
2635 info['description'] = videoNode.findall('./description')[0].text
2636 info['title'] = videoNode.findall('./caption')[0].text
2637 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2638 manifest_url = videoNode.findall('./file')[0].text
2640 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
# Step 2: fetch the Adobe f4m manifest; the hdcore query parameter is
# appended here, presumably required by the HDS server -- TODO confirm.
2643 manifest_url += '?hdcore=2.10.3'
2644 self.report_manifest(video_id)
2646 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2647 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2648 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
# The manifest uses the f4m namespace; pull the media node's url
# attribute and the document id.
2651 adoc = xml.etree.ElementTree.fromstring(manifestXml)
2653 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2654 node_id = media_node.attrib['url']
2655 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2656 except IndexError as err:
2657 self._downloader.trouble(u'\nERROR: Invalid manifest file')
# Build the direct segment/fragment URL from the manifest's host + ids.
2660 url_pr = compat_urllib_parse_urlparse(manifest_url)
2661 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
2668 class XVideosIE(InfoExtractor):
2669 """Information extractor for xvideos.com"""
2671 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2672 IE_NAME = u'xvideos'
2674 def report_webpage(self, video_id):
2675 """Report information extraction."""
2676 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2678 def report_extraction(self, video_id):
2679 """Report information extraction."""
2680 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2682 def _real_extract(self, url):
2683 mobj = re.match(self._VALID_URL, url)
2685 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2687 video_id = mobj.group(1)
2689 self.report_webpage(video_id)
# Canonicalize the page URL from the extracted numeric id.
2691 request = compat_urllib_request.Request(r'http://www.xvideos.com/video' + video_id)
2693 webpage_bytes = compat_urllib_request.urlopen(request).read()
2694 webpage = webpage_bytes.decode('utf-8', 'replace')
2695 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2696 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2699 self.report_extraction(video_id)
# The media URL is passed as a percent-encoded 'flv_url' parameter
# inside the page; unquote it to get the real URL.
2703 mobj = re.search(r'flv_url=(.+?)&', webpage)
2705 self._downloader.trouble(u'ERROR: unable to extract video url')
2707 video_url = compat_urllib_parse.unquote(mobj.group(1))
# Title is the <title> text with the trailing ' - XVID...' suffix cut off.
2711 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2713 self._downloader.trouble(u'ERROR: unable to extract video title')
2715 video_title = mobj.group(1)
2718 # Extract video thumbnail
2719 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2721 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2723 video_thumbnail = mobj.group(0)
2729 'upload_date': None,
2730 'title': video_title,
2732 'thumbnail': video_thumbnail,
2733 'description': None,
2739 class SoundcloudIE(InfoExtractor):
2740 """Information extractor for soundcloud.com
2741 To access the media, the uid of the song and a stream token
2742 must be extracted from the page source and the script must make
2743 a request to media.soundcloud.com/crossdomain.xml. Then
2744 the media can be grabbed by requesting from an url composed
2745 of the stream token and uid
# Group 1: uploader slug; group 2: track slug.
2748 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2749 IE_NAME = u'soundcloud'
2751 def __init__(self, downloader=None):
2752 InfoExtractor.__init__(self, downloader)
2754 def report_resolve(self, video_id):
2755 """Report information extraction."""
2756 self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))
2758 def report_extraction(self, video_id):
2759 """Report information extraction."""
2760 self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))
2762 def _real_extract(self, url):
2763 mobj = re.match(self._VALID_URL, url)
2765 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2768 # extract uploader (which is in the url)
2769 uploader = mobj.group(1)
2770 # extract simple title (uploader + slug of song title)
2771 slug_title = mobj.group(2)
2772 simple_title = uploader + u'-' + slug_title
2774 self.report_resolve('%s/%s' % (uploader, slug_title))
# Resolve the human-readable page URL to track metadata via the public
# API (client_id is hard-coded here).
2776 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2777 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2778 request = compat_urllib_request.Request(resolv_url)
2780 info_json_bytes = compat_urllib_request.urlopen(request).read()
2781 info_json = info_json_bytes.decode('utf-8')
2782 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2783 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2786 info = json.loads(info_json)
2787 video_id = info['id']
2788 self.report_extraction('%s/%s' % (uploader, slug_title))
# Second request: the streams endpoint yields the actual media URLs.
2790 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2791 request = compat_urllib_request.Request(streams_url)
2793 stream_json_bytes = compat_urllib_request.urlopen(request).read()
2794 stream_json = stream_json_bytes.decode('utf-8')
2795 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2796 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2799 streams = json.loads(stream_json)
# Always use the 128kbit/s MP3 stream.
2800 mediaURL = streams['http_mp3_128_url']
2805 'uploader': info['user']['username'],
2806 'upload_date': info['created_at'],
2807 'title': info['title'],
2809 'description': info['description'],
2813 class InfoQIE(InfoExtractor):
2814 """Information extractor for infoq.com"""
2816 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2819 def report_webpage(self, video_id):
2820 """Report information extraction."""
2821 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2823 def report_extraction(self, video_id):
2824 """Report information extraction."""
2825 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2827 def _real_extract(self, url):
2828 mobj = re.match(self._VALID_URL, url)
2830 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2833 self.report_webpage(url)
2835 request = compat_urllib_request.Request(url)
2837 webpage = compat_urllib_request.urlopen(request).read()
2838 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2839 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2842 self.report_extraction(url)
# The page embeds a base64-encoded, percent-encoded path in jsclassref;
# decoding it and prefixing the RTMP server yields the stream URL.
2846 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2848 self._downloader.trouble(u'ERROR: unable to extract video url')
2850 video_url = 'rtmpe://video.infoq.com/cfx/st/' + compat_urllib_parse.unquote(mobj.group(1).decode('base64'))
2854 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2856 self._downloader.trouble(u'ERROR: unable to extract video title')
2858 video_title = mobj.group(1).decode('utf-8')
2860 # Extract description
2861 video_description = u'No description available.'
2862 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2863 if mobj is not None:
2864 video_description = mobj.group(1).decode('utf-8')
# Derive the id and extension from the decoded media path's basename.
2866 video_filename = video_url.split('/')[-1]
2867 video_id, extension = video_filename.split('.')
2873 'upload_date': None,
2874 'title': video_title,
2875 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2877 'description': video_description,
# NOTE(review): this chunk is an elided extraction — each line carries a stray
# numeric prefix and gaps in that numbering show missing statements (the
# `try:` lines paired with the `except` clauses below, several `return`s,
# and parts of the final info dict). Code is left byte-identical; only
# comments/docstrings are added.
2882 class MixcloudIE(InfoExtractor):
2883 """Information extractor for www.mixcloud.com"""
# Marked broken so tests skip it; the site moved to a new API (see URL).
2885 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2886 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2887 IE_NAME = u'mixcloud'
2889 def __init__(self, downloader=None):
2890 InfoExtractor.__init__(self, downloader)
# --- progress-reporting helpers ---
2892 def report_download_json(self, file_id):
2893 """Report JSON download to the downloader's screen."""
2894 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2896 def report_extraction(self, file_id):
2897 """Report information extraction."""
2898 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2900 def get_urls(self, jsonData, fmt, bitrate='best'):
2901 """Get urls from 'audio_formats' section in json"""
# If the requested bitrate is missing (or None/'best'), fall back to the
# highest bitrate key available for this format.
2904 bitrate_list = jsonData[fmt]
2905 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2906 bitrate = max(bitrate_list) # select highest
2908 url_list = jsonData[fmt][bitrate]
# A TypeError here means jsonData[fmt] is a plain url list rather than a
# bitrate -> urls mapping.
2909 except TypeError: # we have no bitrate info.
2910 url_list = jsonData[fmt]
2913 def check_urls(self, url_list):
2914 """Returns 1st active url from list"""
# Probe each candidate with an HTTP request; network errors fall through
# to the next candidate (the success `return` is elided from this view).
2915 for url in url_list:
2917 compat_urllib_request.urlopen(url)
2919 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2924 def _print_formats(self, formats):
# Print a "format <tab> bitrate <tab> [extension]" table for --list-formats.
2925 print('Available formats:')
2926 for fmt in formats.keys():
2927 for b in formats[fmt]:
2929 ext = formats[fmt][b][0]
2930 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
# Same fallback as get_urls: no bitrate layer, formats[fmt] is a url list.
2931 except TypeError: # we have no bitrate info
2932 ext = formats[fmt][0]
2933 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2936 def _real_extract(self, url):
2937 mobj = re.match(self._VALID_URL, url)
2939 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2941 # extract uploader & filename from url
# NOTE(review): the .decode('utf-8') calls on regex groups assume Python 2
# byte strings; under Python 3 they would raise AttributeError — confirm
# the intended interpreter before reusing this code.
2942 uploader = mobj.group(1).decode('utf-8')
2943 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2945 # construct API request
2946 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2947 # retrieve .json file with links to files
2948 request = compat_urllib_request.Request(file_url)
2950 self.report_download_json(file_url)
2951 jsonData = compat_urllib_request.urlopen(request).read()
2952 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2953 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
# Parse the API response; 'audio_formats' maps format name -> urls.
2957 json_data = json.loads(jsonData)
2958 player_url = json_data['player_swf_url']
2959 formats = dict(json_data['audio_formats'])
2961 req_format = self._downloader.params.get('format', None)
2964 if self._downloader.params.get('listformats', None):
2965 self._print_formats(formats)
# No explicit format (or 'best'): take the first format whose urls
# respond; otherwise honor the requested format if the API offers it.
2968 if req_format is None or req_format == 'best':
2969 for format_param in formats.keys():
2970 url_list = self.get_urls(formats, format_param)
2972 file_url = self.check_urls(url_list)
2973 if file_url is not None:
2976 if req_format not in formats.keys():
2977 self._downloader.trouble(u'ERROR: format is not available')
2980 url_list = self.get_urls(formats, req_format)
2981 file_url = self.check_urls(url_list)
2982 format_param = req_format
# Resulting info dictionary (opening `return [{` elided from this view).
2985 'id': file_id.decode('utf-8'),
2986 'url': file_url.decode('utf-8'),
2987 'uploader': uploader.decode('utf-8'),
2988 'upload_date': None,
2989 'title': json_data['name'],
2990 'ext': file_url.split('.')[-1].decode('utf-8'),
2991 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2992 'thumbnail': json_data['thumbnail_url'],
2993 'description': json_data['description'],
2994 'player_url': player_url.decode('utf-8'),
# NOTE(review): elided extraction — stray numeric prefixes and numbering gaps
# mean some statements (e.g. `if mobj is None:`, `try:` lines, dict
# openers/closers, `return results`) are missing from this view. Code kept
# byte-identical; comments/docstrings only.
2997 class StanfordOpenClassroomIE(InfoExtractor):
2998 """Information extractor for Stanford's Open ClassRoom"""
# The regex distinguishes three URL shapes via named groups: a specific
# video ('course' + 'video'), a course page ('course' only), or the root.
3000 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3001 IE_NAME = u'stanfordoc'
3003 def report_download_webpage(self, objid):
3004 """Report webpage download."""
3005 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3007 def report_extraction(self, video_id):
3008 """Report information extraction."""
3009 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3011 def _real_extract(self, url):
3012 mobj = re.match(self._VALID_URL, url)
3014 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# --- case 1: a specific video — fetch its metadata XML directly ---
3017 if mobj.group('course') and mobj.group('video'): # A specific video
3018 course = mobj.group('course')
3019 video = mobj.group('video')
3021 'id': course + '_' + video,
3023 'upload_date': None,
3026 self.report_extraction(info['id'])
3027 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3028 xmlUrl = baseUrl + video + '.xml'
3030 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
3031 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3032 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
# Title and file name come from the per-video metadata XML.
3034 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3036 info['title'] = mdoc.findall('./title')[0].text
3037 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3039 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3041 info['ext'] = info['url'].rpartition('.')[2]
# --- case 2: a course page — collect its VideoPage links and recurse ---
3043 elif mobj.group('course'): # A course page
3044 course = mobj.group('course')
3049 'upload_date': None,
3052 self.report_download_webpage(info['id'])
3054 coursepage = compat_urllib_request.urlopen(url).read()
3055 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3056 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
3059 m = re.search('<h1>([^<]+)</h1>', coursepage)
3061 info['title'] = unescapeHTML(m.group(1))
3063 info['title'] = info['id']
3065 m = re.search('<description>([^<]+)</description>', coursepage)
3067 info['description'] = unescapeHTML(m.group(1))
# orderedSet keeps the page's link order while dropping duplicates.
3069 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3072 'type': 'reference',
3073 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
# Each reference entry is re-dispatched through self.extract() so the
# specific-video branch above handles it.
3077 for entry in info['list']:
3078 assert entry['type'] == 'reference'
3079 results += self.extract(entry['url'])
# --- case 3: the root page — collect CoursePage links and recurse ---
3084 'id': 'Stanford OpenClassroom',
3087 'upload_date': None,
3090 self.report_download_webpage(info['id'])
3091 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3093 rootpage = compat_urllib_request.urlopen(rootURL).read()
3094 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3095 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
3098 info['title'] = info['id']
3100 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3103 'type': 'reference',
3104 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3109 for entry in info['list']:
3110 assert entry['type'] == 'reference'
3111 results += self.extract(entry['url'])
# NOTE(review): elided extraction — stray numeric prefixes and numbering gaps
# mean some statements (`if mobj is None:`, `try:` lines, `return`s and the
# info-dict opener/closer) are missing from this view. Code byte-identical;
# comments only.
3114 class MTVIE(InfoExtractor):
3115 """Information extractor for MTV.com"""
3117 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3120 def report_webpage(self, video_id):
3121 """Report webpage download."""
3122 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3124 def report_extraction(self, video_id):
3125 """Report information extraction."""
3126 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3128 def _real_extract(self, url):
3129 mobj = re.match(self._VALID_URL, url)
3131 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# The scheme is optional in _VALID_URL; default to http:// when absent.
3133 if not mobj.group('proto'):
3134 url = 'http://' + url
3135 video_id = mobj.group('videoid')
3136 self.report_webpage(video_id)
3138 request = compat_urllib_request.Request(url)
3140 webpage = compat_urllib_request.urlopen(request).read()
3141 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3142 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
# Metadata is scraped from <meta> tags in the page. The .decode('iso-8859-1')
# calls assume Python 2 byte strings — TODO confirm interpreter.
3145 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3147 self._downloader.trouble(u'ERROR: unable to extract song name')
3149 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3150 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3152 self._downloader.trouble(u'ERROR: unable to extract performer')
3154 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3155 video_title = performer + ' - ' + song_name
3157 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
# NOTE(review): this error message is missing a verb ("unable to extract
# mtvn_uri") — runtime string left untouched here.
3159 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3161 mtvn_uri = mobj.group(1)
3163 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3165 self._downloader.trouble(u'ERROR: unable to extract content id')
3167 content_id = mobj.group(1)
# mediaGen endpoint returns an XML playlist of <rendition> entries.
3169 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3170 self.report_extraction(video_id)
3171 request = compat_urllib_request.Request(videogen_url)
3173 metadataXml = compat_urllib_request.urlopen(request).read()
3174 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3175 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
3178 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3179 renditions = mdoc.findall('.//rendition')
3181 # For now, always pick the highest quality.
3182 rendition = renditions[-1]
# Format string is "<ext>-<width>x<height>_<bitrate>" built from the
# rendition's attributes; mime type 'x/y' yields ext 'y' via partition.
3185 _,_,ext = rendition.attrib['type'].partition('/')
3186 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3187 video_url = rendition.find('./src').text
3189 self._downloader.trouble('Invalid rendition field.')
3195 'uploader': performer,
3196 'upload_date': None,
3197 'title': video_title,
# NOTE(review): elided extraction — stray numeric prefixes and numbering gaps
# mean some statements are missing from this view (notably the `def _gen_sid`
# line, `try:` lines, parts of the format-selection branches, and the final
# `return files_info`). Code byte-identical; comments only.
3205 class YoukuIE(InfoExtractor):
3207 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3210 def __init__(self, downloader=None):
3211 InfoExtractor.__init__(self, downloader)
3213 def report_download_webpage(self, file_id):
3214 """Report webpage download."""
3215 self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)
3217 def report_extraction(self, file_id):
3218 """Report information extraction."""
3219 self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)
# Body of _gen_sid (its `def` line is elided above): builds a session id
# from a millisecond timestamp plus two random components.
3222 nowTime = int(time.time() * 1000)
3223 random1 = random.randint(1000,1998)
3224 random2 = random.randint(1000,9999)
3226 return "%d%d%d" %(nowTime,random1,random2)
3228 def _get_file_ID_mix_string(self, seed)(
3228 def _get_file_ID_mix_string(self, seed):
# Deterministic pseudo-random shuffle of the alphabet, driven by `seed`
# with a linear-congruential step; used to decode obfuscated file ids.
3230 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3232 for i in range(len(source)):
3233 seed = (seed * 211 + 30031 ) % 65536
3234 index = math.floor(seed / 65536 * len(source) )
3235 mixed.append(source[int(index)])
3236 source.remove(source[int(index)])
3237 #return ''.join(mixed)
3240 def _get_file_id(self, fileId, seed):
# Maps each '*'-separated index in fileId through the shuffled alphabet.
3241 mixed = self._get_file_ID_mix_string(seed)
3242 ids = fileId.split('*')
3246 realId.append(mixed[int(ch)])
3247 return ''.join(realId)
3249 def _real_extract(self, url):
3250 mobj = re.match(self._VALID_URL, url)
3252 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3254 video_id = mobj.group('ID')
# getPlayList returns JSON describing title, seed, formats and segments.
3256 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3258 request = compat_urllib_request.Request(info_url, None, std_headers)
3260 self.report_download_webpage(video_id)
3261 jsondata = compat_urllib_request.urlopen(request).read()
3262 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3263 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3266 self.report_extraction(video_id)
3268 jsonstr = jsondata.decode('utf-8')
3269 config = json.loads(jsonstr)
3271 video_title = config['data'][0]['title']
3272 seed = config['data'][0]['seed']
# Format selection: 'best'/'worst' branches are partially elided here.
3274 format = self._downloader.params.get('format', None)
3275 supported_format = config['data'][0]['streamfileids'].keys()
3277 if format is None or format == 'best':
3278 if 'hd2' in supported_format:
3283 elif format == 'worst':
3291 fileid = config['data'][0]['streamfileids'][format]
3292 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3293 except (UnicodeDecodeError, ValueError, KeyError):
3294 self._downloader.trouble(u'ERROR: unable to extract info section')
3298 sid = self._gen_sid()
3299 fileid = self._get_file_id(fileid, seed)
3301 #column 8,9 of fileid represent the segment number
3302 #fileid[7:9] should be changed
# One download url (and one info dict) per segment key.
3303 for index, key in enumerate(keys):
3305 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3306 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3309 'id': '%s_part%02d' % (video_id, index),
3310 'url': download_url,
3312 'upload_date': None,
3313 'title': video_title,
3316 files_info.append(info)
# NOTE(review): elided extraction — stray numeric prefixes and numbering gaps
# mean some statements (`if result is None:`/`if mobj is None:`, `try:`,
# `return`s and the info-dict opener) are missing from this view. Code
# byte-identical; comments only.
3321 class XNXXIE(InfoExtractor):
3322 """Information extractor for xnxx.com"""
3324 _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
# Page-scraping regexes: flv url, title, and thumbnail.
3326 VIDEO_URL_RE = r'flv_url=(.*?)&'
3327 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3328 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3330 def report_webpage(self, video_id):
3331 """Report webpage download"""
3332 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3334 def report_extraction(self, video_id):
3335 """Report information extraction"""
3336 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3338 def _real_extract(self, url):
3339 mobj = re.match(self._VALID_URL, url)
3341 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3343 video_id = mobj.group(1)
3345 self.report_webpage(video_id)
3347 # Get webpage content
3349 webpage_bytes = compat_urllib_request.urlopen(url).read()
3350 webpage = webpage_bytes.decode('utf-8')
3351 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3352 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
# The flv url is percent-encoded in the page; unquote it.
3355 result = re.search(self.VIDEO_URL_RE, webpage)
3357 self._downloader.trouble(u'ERROR: unable to extract video url')
3359 video_url = compat_urllib_parse.unquote(result.group(1))
3361 result = re.search(self.VIDEO_TITLE_RE, webpage)
3363 self._downloader.trouble(u'ERROR: unable to extract video title')
3365 video_title = result.group(1)
3367 result = re.search(self.VIDEO_THUMB_RE, webpage)
3369 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3371 video_thumbnail = result.group(1)
3377 'upload_date': None,
3378 'title': video_title,
3380 'thumbnail': video_thumbnail,
3381 'description': None,
# NOTE(review): elided extraction — stray numeric prefixes and numbering gaps
# mean some statements (`if mobj is None:`, `try:` lines, fallback branches
# and `return`s) are missing from this view. Code byte-identical;
# comments/docstrings only (typo "extry" fixed in docstrings).
3385 class GooglePlusIE(InfoExtractor):
3386 """Information extractor for plus.google.com."""
3388 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3389 IE_NAME = u'plus.google'
3391 def __init__(self, downloader=None):
3392 InfoExtractor.__init__(self, downloader)
3394 def report_extract_entry(self, url):
3395 """Report downloading entry"""
3396 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3398 def report_date(self, upload_date):
3399 """Report the entry's upload date"""
3400 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3402 def report_uploader(self, uploader):
3403 """Report the entry's uploader"""
3404 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3406 def report_title(self, video_title):
3407 """Report the entry's title"""
3408 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3410 def report_extract_vid_page(self, video_page):
3411 """Report information extraction."""
3412 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3414 def _real_extract(self, url):
3415 # Extract id from URL
3416 mobj = re.match(self._VALID_URL, url)
3418 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3421 post_url = mobj.group(0)
3422 video_id = mobj.group(1)
3424 video_extension = 'flv'
3426 # Step 1, Retrieve post webpage to extract further information
3427 self.report_extract_entry(post_url)
3428 request = compat_urllib_request.Request(post_url)
3430 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3431 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3432 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3435 # Extract update date
3437 pattern = 'title="Timestamp">(.*?)</a>'
3438 mobj = re.search(pattern, webpage)
3440 upload_date = mobj.group(1)
3441 # Convert timestring to a format suitable for filename
# Page shows YYYY-MM-DD; normalize to YYYYMMDD for the info dict.
3442 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3443 upload_date = upload_date.strftime('%Y%m%d')
3444 self.report_date(upload_date)
# Extract uploader from the rel="author" anchor.
3448 pattern = r'rel\="author".*?>(.*?)</a>'
3449 mobj = re.search(pattern, webpage)
3451 uploader = mobj.group(1)
3452 self.report_uploader(uploader)
3455 # Get the first line for title
# Title is the first line of the Description meta tag (stops at newline,
# '<' or '"').
3457 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3458 mobj = re.search(pattern, webpage)
3460 video_title = mobj.group(1)
3461 self.report_title(video_title)
3463 # Step 2, Stimulate clicking the image box to launch video
3464 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3465 mobj = re.search(pattern, webpage)
3467 self._downloader.trouble(u'ERROR: unable to extract video page URL')
3469 video_page = mobj.group(1)
3470 request = compat_urllib_request.Request(video_page)
3472 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3473 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3474 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3476 self.report_extract_vid_page(video_page)
3479 # Extract video links on video page
3480 """Extract video links of all sizes"""
3481 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3482 mobj = re.findall(pattern, webpage)
3484 self._downloader.trouble(u'ERROR: unable to extract video links')
3486 # Sort in resolution
# findall yields (resolution, url) tuples; sorting puts the highest
# resolution last.
3487 links = sorted(mobj)
3489 # Choose the lowest of the sort, i.e. highest resolution
3490 video_url = links[-1]
3491 # Only get the url. The resolution part in the tuple has no use anymore
3492 video_url = video_url[-1]
3493 # Treat escaped \u0026 style hex
# Python 2 str has .decode; on Python 3 the AttributeError branch
# re-encodes through bytes to apply unicode-escape.
3495 video_url = video_url.decode("unicode_escape")
3496 except AttributeError: # Python 3
3497 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3503 'uploader': uploader,
3504 'upload_date': upload_date,
3505 'title': video_title,
3506 'ext': video_extension,
# NOTE(review): elided extraction — stray numeric prefixes and numbering gaps
# mean some statements (`if mobj is None:`, `try:`, `return`s, parts of the
# info dict and _findProp's else-branch) are missing from this view. Code
# byte-identical; comments only.
3509 class NBAIE(InfoExtractor):
3510 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3513 def report_extraction(self, video_id):
3514 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3516 def _real_extract(self, url):
3517 mobj = re.match(self._VALID_URL, url)
3519 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3522 video_id = mobj.group(1)
3523 if video_id.endswith('/index.html'):
3524 video_id = video_id[:-len('/index.html')]
3526 self.report_extraction(video_id)
3528 urlh = compat_urllib_request.urlopen(url)
3529 webpage_bytes = urlh.read()
3530 webpage = webpage_bytes.decode('utf-8', 'ignore')
3531 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3532 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
# Direct CDN url is derived from the path portion of the page url.
3535 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
# Helper: first regex group from the page, HTML-unescaped, or `default`.
3536 def _findProp(rexp, default=None):
3537 m = re.search(rexp, webpage)
3539 return unescapeHTML(m.group(1))
3543 shortened_video_id = video_id.rpartition('/')[2]
3544 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3546 'id': shortened_video_id,
# NOTE(review): 'uploader_date' looks like a typo for the standard
# 'upload_date' key used by the other extractors — confirm before fixing,
# as changing the key changes the emitted info dict.
3550 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3551 'description': _findProp(r'<div class="description">(.*?)</h1>'),
3555 class JustinTVIE(InfoExtractor):
3556 """Information extractor for justin.tv and twitch.tv"""
3557 # TODO: One broadcast may be split into multiple videos. The key
3558 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3559 # starts at 1 and increases. Can we treat all parts as one video?
3561 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3562 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3563 _JUSTIN_PAGE_LIMIT = 100
3564 IE_NAME = u'justin.tv'
3566 def report_extraction(self, file_id):
3567 """Report information extraction."""
3568 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3570 def report_download_page(self, channel, offset):
3571 """Report attempt to download a single page of videos."""
3572 self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
3573 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3575 # Return count of items, list of *valid* items
3576 def _parse_page(self, url):
3578 urlh = compat_urllib_request.urlopen(url)
3579 webpage_bytes = urlh.read()
3580 webpage = webpage_bytes.decode('utf-8', 'ignore')
3581 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3582 self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
3585 response = json.loads(webpage)
3587 for clip in response:
3588 video_url = clip['video_file_url']
3590 video_extension = os.path.splitext(video_url)[1][1:]
3591 video_date = re.sub('-', '', clip['created_on'][:10])
3595 'title': clip['title'],
3596 'uploader': clip.get('user_id', clip.get('channel_id')),
3597 'upload_date': video_date,
3598 'ext': video_extension,
3600 return (len(response), info)
3602 def _real_extract(self, url):
3603 mobj = re.match(self._VALID_URL, url)
3605 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3608 api = 'http://api.justin.tv'
3609 video_id = mobj.group(mobj.lastindex)
3611 if mobj.lastindex == 1:
3613 api += '/channel/archives/%s.json'
3615 api += '/clip/show/%s.json'
3616 api = api % (video_id,)
3618 self.report_extraction(video_id)
3622 limit = self._JUSTIN_PAGE_LIMIT
3625 self.report_download_page(video_id, offset)
3626 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3627 page_count, page_info = self._parse_page(page_url)
3628 info.extend(page_info)
3629 if not paged or page_count != limit: