2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
13 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader_id:    Nickname or id of the video uploader.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # NOTE(review): the source paste dropped several lines in this class
    # (attributes and small method bodies); they were reconstructed from the
    # surrounding context — verify against upstream history.
    _ready = False          # set to True once _real_initialize() has run
    _downloader = None      # FileDownloader instance (or None)
    _WORKING = True         # set to False in subclasses for broken IEs

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url) is not None

    def working(self):
        """Getter method for _WORKING."""
        return self._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # Run the real initialization only once per instance.
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Derive the default IE name from the class name, dropping the
        # trailing "IE" suffix (e.g. "YoutubeIE" -> "Youtube").
        return type(self).__name__[:-2]
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # NOTE(review): the source paste dropped many lines in this class
    # (try:/return/else: glue, dict bodies, the v= part of the regex);
    # they were reconstructed from context — verify against upstream history.
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.+&)?                                     # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; entries reconstructed — TODO confirm
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> "HxW" display string; entries reconstructed — TODO confirm
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # The verbose _VALID_URL needs the re.VERBOSE flag, hence the override.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timedtext XML into .srt file contents."""
        srt = ''
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # default caption length when no dur attribute
            start = float(start)
            end = start + float(dur)
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
        return srt

    def _extract_subtitles(self, video_id):
        """Return (error_message_or_None, srt_contents_or_None) for video_id."""
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        # Pick the language: user choice > English > first available
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            srt_lang = 'en'
        else:
            srt_lang = list(srt_lang_list.keys())[0]
        if not srt_lang in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
        try:
            srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        if not srt_xml:
            return (u'WARNING: unable to download video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))

    def _print_formats(self, formats):
        """Print the available formats with extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set language, optionally log in and confirm age on youtube.com."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        # Set language so scraped pages are predictable English
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
                'current_form': 'loginForm',
                'next':         '/',
                'action_login': 'Log In',
                'username':     username,
                'password':     password,
                }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # The login form reappearing in the response means auth failed
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':       '/',
                'action_confirm': 'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Download and parse the watch page; return a list of info dicts."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')

        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = compat_urllib_request.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Un-escape the JSON-escaped URL (\\/ -> /)
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' variants until one yields a token
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            try:
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/user/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            # Normalize separators, then try a few date formats
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
            if srt_error:
                self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':           video_id,
                'url':          video_real_url,
                'uploader':     video_uploader,
                'uploader_id':  video_uploader_id,
                'upload_date':  upload_date,
                'title':        video_title,
                'ext':          video_extension,
                'format':       video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # NOTE(review): the source paste dropped glue lines (try:, return,
    # if mobj is None:, else:); reconstructed from context — verify upstream.
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Retrieve the disclaimer page and disable the family filter."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract media URL, title and uploader from a metacafe watch page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate yt-prefixed IDs to the YouTube extractor via the downloader
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fall back to the flashvars blob
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':           video_id.decode('utf-8'),
            'url':          video_url.decode('utf-8'),
            'uploader':     video_uploader.decode('utf-8'),
            'upload_date':  None,
            'title':        video_title,
            'ext':          video_extension.decode('utf-8'),
        }]
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # NOTE(review): the source paste dropped glue lines (try:, return,
    # if mobj is None:, break, else:); reconstructed from context.
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the best-quality media URL and metadata from a video page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable family filter so age-restricted pages are served
        request.add_header('Cookie', 'family_filter=off')
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Pick the first (highest) quality key present in flashvars
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                max_quality = key
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
                break
        else:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if mobj is None:
            # looking for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
            else:
                video_uploader = mobj_official.group(1)
        else:
            video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if mobj is not None:
            # Page shows DD-MM-YYYY; reorder to YYYYMMDD
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        return [{
            'id':           video_id,
            'url':          video_url,
            'uploader':     video_uploader,
            'upload_date':  video_upload_date,
            'title':        video_title,
            'ext':          video_extension,
        }]
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # NOTE(review): the source paste dropped glue lines (try:, return,
    # if mobj is None:, video_url assignment); reconstructed from context.
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the .flv media URL, title and uploader from the page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        return [{
            'id':           video_id.decode('utf-8'),
            'url':          video_url.decode('utf-8'),
            'uploader':     video_uploader,
            'upload_date':  None,
            'title':        video_title,
            'ext':          video_extension.decode('utf-8'),
        }]
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # NOTE(review): the source paste dropped glue lines (try:, return,
    # if mobj is None:, webpage fetch); reconstructed from context.
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract media URL and metadata; recurses once to rewrite
        non-extractable URLs into /watch/ form (new_video=False)."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
            return
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
        yv_bitrate = '700' # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id':           video_id.decode('utf-8'),
            'url':          video_url,
            'uploader':     video_uploader,
            'upload_date':  None,
            'title':        video_title,
            'ext':          video_extension.decode('utf-8'),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
        }]
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs (group 1 is the numeric video id).
    # NOTE: the dot after (www|player) is escaped; previously it was a bare
    # '.' which matched any character.
    _VALID_URL = r'(?:https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page. IndexError: the
        # delimiters were not found; ValueError: the slice is not valid JSON.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # Extract upload date (YYYYMMDD) from the itemprop meta tag
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = {'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the best available quality; for/else reports when nothing matched.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live-stream pages end in index-<number>.html
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download *url* and return its body decoded as UTF-8 text."""
        self._downloader.increment_downloads()
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            # Decode once here so callers can run str regexes over the page.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex*, and collect the groups listed in
        *matchTuples* ((group_index, key, error_message) triples) into a dict.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Resolve the rtmp path/player for a live page.

        NOTE(review): the computed video_url is never returned by the
        original code either — live extraction looks unfinished upstream.
        """
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the chain of intermediate XML documents for a "+7" video
        and return its info dictionary."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        # info values are already str (page was decoded in fetch_webpage),
        # so no .decode() calls are needed here.
        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report that a redirect (e.g. from a URL shortener) is being followed."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain.

        Returns True (and hands the new URL back to the downloader) when the
        request was redirected, False otherwise.
        """
        class HeadRequest(compat_urllib_request.Request):
            # Issue HEAD instead of GET so we only fetch headers.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True

    def _real_extract(self, url):
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            # Decode to str so the str regexes below work; 'replace' keeps
            # us alive on pages that are not valid UTF-8.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            # Fixed: this previously reported 'unable to extract title'.
            self._downloader.trouble(u'ERROR: unable to extract uploader')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    # "ytsearch:foo", "ytsearch5:foo" or "ytsearchall:foo"
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the leading 'ytsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                # Decode the API response so json.loads always gets str.
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # The server caps the result count; never ask for more than exist.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    # "gvsearch:foo", "gvsearch5:foo" or "gvsearchall:foo"
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the leading 'gvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                # Decode so the str regexes below can run over the page.
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for vid in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % vid])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for vid in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % vid])
                return

            pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    # "yvsearch:foo", "yvsearch5:foo" or "yvsearchall:foo"
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the leading 'yvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                # Decode so the str regexes below can run over the page.
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for vid in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % vid])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for vid in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % vid])
                return

            pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Single video case: hand the video id straight back to the downloader
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])
            return

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        else:
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(playlist_id, pagenum)
            url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        total = len(video_ids)

        # Honor --playlist-start / --playlist-end (1-based start, -1 = no end)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        if len(video_ids) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Download channel pages
        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(channel_id, pagenum)
            url = self._TEMPLATE_URL % (channel_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []
            for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.
        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        # Honor --playlist-start / --playlist-end (1-based start, -1 = no end)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12  # blip.tv's AJAX endpoint returns at most 12 episodes per page
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            mobj = re.search(r'data-users-id="([^"]+)"', page)
            page_base = page_base % mobj.group(1)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)

            request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers
            ids_in_page = []
            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.
            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        # Honor --playlist-start / --playlist-end (1-based start, -1 = no end)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
    IE_NAME = u'DepositFiles'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        # POST data must be bytes; the page is decoded once so the str
        # regexes below work and no later .decode() calls are needed.
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication).encode('utf-8'))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        file_title = mobj.group(1)

        return [{
            'id':       file_id,
            'url':      file_url,
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension,
        }]
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    # NOTE(review): this view of the file is elided — several try/guard/return
    # lines are missing; comments below describe only the visible code.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    # Mobile login endpoint used by _real_initialize.
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    # Format identifiers, ordered best quality first (used for selection below).
    _available_formats = ['video', 'highqual', 'lowqual']
    _video_extensions = {
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page"""
        # Regexes for scalar fields embedded in the page's JavaScript.
        data = {'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                # Values are JS-escaped unicode inside the page; unescape then URL-unquote.
                video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Collect one URL per available format.
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

    def _real_initialize(self):
        """Log in using --username/--password or .netrc credentials, if any."""
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))

        if useremail is None:

        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        login_results = compat_urllib_request.urlopen(request).read()
        # A login form in the response body means authentication did not succeed.
        if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
            self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))

    def _real_extract(self, url):
        """Download the video page, parse it, and build info dict(s)."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        page = compat_urllib_request.urlopen(request)
        video_webpage = page.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = video_info['owner']

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')

        # thumbnail image — missing thumbnail is only a warning, not fatal.
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
            video_thumbnail = video_info['thumbnail']

        # upload date
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                upload_date = time.strftime('%Y%m%d', timetuple[0:9])

        # description
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)
        format_limit = self._downloader.params.get('format_limit', None)

        # format_limit caps quality: keep only formats at or below the limit.
        if format_limit is not None and format_limit in self._available_formats:
            format_list = self._available_formats[self._available_formats.index(format_limit):]
            format_list = self._available_formats
        existing_formats = [x for x in format_list if x in url_map]
        if len(existing_formats) == 0:
            self._downloader.trouble(u'ERROR: no known formats available for video')
        if req_format is None:
            video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
        elif req_format == 'worst':
            video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
        elif req_format == '-1':
            video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            if req_format not in url_map:
                self._downloader.trouble(u'ERROR: requested format not available')
            video_url_list = [(req_format, url_map[req_format])] # Specific format

        for format_param, video_real_url in video_url_list:
            # Extension defaults to mp4 when the format is unknown.
            video_extension = self._video_extensions.get(format_param, 'mp4')

            # NOTE(review): the `results.append({`/`return` framing is elided.
            'id':       video_id.decode('utf-8'),
            'url':      video_real_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'format':   (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description.decode('utf-8'),
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    # NOTE(review): this view of the file is elided — several try/guard/return
    # lines are missing; comments below describe only the visible code.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Pattern used to pull the filename extension off the media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Fetch blip.tv's JSON API description of the URL and build the info dict."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Ask the page for its JSON representation.
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        self.report_extraction(mobj.group(1))
        urlh = compat_urllib_request.urlopen(request)
        if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
            # Server returned the media itself instead of JSON metadata.
            basename = url.split('/')[-1]
            title,ext = os.path.splitext(basename)
            title = title.decode('UTF-8')
            ext = ext.replace('.', '')
            self.report_direct_download(title)
            'upload_date':  None,
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))

        if info is None: # Regular URL
            json_code_bytes = urlh.read()
            json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))

            json_data = json.loads(json_code)
            # The payload is either the post itself or wrapped under 'Post'.
            if 'Post' in json_data:
                data = json_data['Post']

            # blip.tv timestamps look like '11-28-09 10:30AM'; normalize to YYYYMMDD.
            upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
            video_url = data['media']['url']
            umobj = re.match(self._URL_EXT, video_url)
            raise ValueError('Can not determine filename extension')
            ext = umobj.group(1)

            'id': data['item_id'],
            'uploader': data['display_name'],
            'upload_date': upload_date,
            'title': data['title'],
            'format': data['media']['mimeType'],
            'thumbnail': data['thumbnailUrl'],
            'description': data['description'],
            'player_url': data['embedUrl']
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))

        # blip.tv serves some media only to the iTunes user agent.
        std_headers['User-Agent'] = 'iTunes/10.6.1'
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        """Extract media URL and title from a myvideo.de watch page.

        Returns a one-element list with the info dictionary, or None after
        reporting an error through the downloader.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUG FIX: this previously called self._download.trouble(...);
            # the attribute is `_downloader` (as used everywhere else in this
            # class), so invalid URLs raised AttributeError instead of
            # reporting the error.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        request = compat_urllib_request.Request('http://www.myvideo.de/watch/%s' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        # The thumbnail link encodes the movie directory; the flv lives next to it.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # NOTE(review): this view of the file is elided — several try/guard/return
    # lines are missing; comments below describe only the visible code.
    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                         (?P<showname>thedailyshow|colbertnation)\.com/
                        (full-episodes/(?P<episode>.*)|
                          (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                          |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
    IE_NAME = u'comedycentral'

    # Bitrates, worst-to-best; the last entry is picked as "highest" below.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
    _video_dimensions = {

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written with re.VERBOSE, which the
        # base-class suitable() does not pass to re.match.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

    def _print_formats(self, formats):
        """Print the known format table (id, extension, dimensions)."""
        print('Available formats:')
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        """Resolve show/episode/clip URLs into downloadable media info."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Expand :tds / :colbert style abbreviations to the newest full episode.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
                epTitle = mobj.group('cntitle')
            dlNewest = not mobj.group('episode')
            epTitle = mobj.group('showname')
            epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        htmlHandle = compat_urllib_request.urlopen(req)
        html = htmlHandle.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # The server may redirect to the concrete newest episode; re-validate.
        url = htmlHandle.geturl()
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
        if mobj.group('episode') == '':
            self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
        epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', html)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.
            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', html)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        # Follow redirects to get the canonical player URL (used by rtmpdump).
        urlHandle = compat_urllib_request.urlopen(playerUrl_raw)
        playerUrl = urlHandle.geturl()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + compat_str(err))

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))

        # The MRSS index lists one <item> per act/segment of the episode.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle)
            configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)

                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])

            # For now, just pick the highest bitrate
            format,video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            format, video_url = f, v

            # Patch to download from alternative CDN, which does not
            # break on current RTMPDump builds
            broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
            better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"

            if video_url.startswith(broken_cdn):
                video_url = video_url.replace(broken_cdn, better_cdn)

            effTitle = showId + u'-' + epTitle
            'upload_date': officialDate,
            'description': officialTitle,
            'player_url': None #playerUrl
            results.append(info)
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    # NOTE(review): this view of the file is elided — some try/guard/return
    # lines are missing; comments below describe only the visible code.
    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Scrape the page's OpenGraph tags, then its player config, for the media URL."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        webPage = compat_urllib_request.urlopen(url)
        webPageBytes = webPage.read()
        # Honor the charset declared in the Content-Type header; fall back to utf-8.
        m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
        webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))

        # Pull metadata out of the page's <meta> tags.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        # The player URL carries the config location in its query string.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        configJSON = compat_urllib_request.urlopen(configUrl)
        m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
        configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))

        playlist = config['playlist']
        # Index 1 is the actual video entry in the player playlist.
        videoUrl = playlist[1]['url']

        'uploader': showName,
        'upload_date': None,
        'thumbnail': imgUrl,
        'description': description,
        'player_url': playerUrl,
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    # NOTE(review): this view of the file is elided — some try/guard/return
    # lines are missing; comments below describe only the visible code.
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Fetch moogaloop metadata XML, then the f4m manifest, and derive the media URL."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        'upload_date': None,

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        videoNode = mdoc.findall('./video')[0]
        info['description'] = videoNode.findall('./description')[0].text
        info['title'] = videoNode.findall('./caption')[0].text
        info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
        manifest_url = videoNode.findall('./file')[0].text
        self._downloader.trouble(u'\nERROR: Invalid metadata XML file')

        # Version marker required by the Adobe HTTP Dynamic Streaming manifest endpoint.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))

        # Parse the f4m manifest (Adobe f4m namespace).
        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
        node_id = media_node.attrib['url']
        video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')

        # Rebuild the fragment URL from the manifest location plus media/segment ids.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    # NOTE(review): this view of the file is elided — some try/guard/return
    # lines are missing; comments below describe only the visible code.
    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Scrape media URL, title and thumbnail from the video page."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        request = compat_urllib_request.Request(r'http://www.xvideos.com/video' + video_id)
        webpage_bytes = compat_urllib_request.urlopen(request).read()
        # 'replace' keeps extraction alive on stray non-utf-8 bytes in the page.
        webpage = webpage_bytes.decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        self.report_extraction(video_id)

        # Extract video URL (percent-encoded in the flv_url page parameter).
        mobj = re.search(r'flv_url=(.+?)&', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = compat_urllib_parse.unquote(mobj.group(1))

        # Extract title from the <title> tag, dropping the site suffix.
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1)

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(0)

        'upload_date': None,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': None,
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
    """

    # NOTE(review): this view of the file is elided — some try/guard/return
    # lines are missing; comments below describe only the visible code.
    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve a soundcloud track URL via the API, then fetch its stream URL."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title =  mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # The resolve endpoint maps the page URL to the API track object.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        info_json_bytes = compat_urllib_request.urlopen(request).read()
        info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # The streams endpoint yields concrete media URLs per quality/codec.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        stream_json_bytes = compat_urllib_request.urlopen(request).read()
        stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        'uploader': info['user']['username'],
        'upload_date': info['created_at'],
        'title': info['title'],
        'description': info['description'],
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""

    # NOTE(review): this view of the file is elided — some try/guard/return
    # lines are missing; comments below describe only the visible code.
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Scrape the presentation page for the rtmpe media URL and metadata."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        self.report_webpage(url)

        request = compat_urllib_request.Request(url)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        self.report_extraction(url)

        # The media path is base64-encoded in the page's jsclassref attribute.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + compat_urllib_parse.unquote(mobj.group(1).decode('base64'))

        # Extract title from the contentTitle JS assignment.
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1).decode('utf-8')

        # Derive the id and extension from the media filename.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        'upload_date': None,
        'title': video_title,
        'ext': extension, # Extension is always(?) mp4, but seems to be flv
        'description': video_description,
2899 class MixcloudIE(InfoExtractor):
2900 """Information extractor for www.mixcloud.com"""
2902 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2903 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2904 IE_NAME = u'mixcloud'
    def __init__(self, downloader=None):
        """Constructor; delegates downloader storage to the base class."""
        InfoExtractor.__init__(self, downloader)
    def report_download_json(self, file_id):
        """Report JSON download."""
        # NOTE(review): file_id is accepted but not interpolated into the message.
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        # NOTE(review): the opening `try:` and trailing `return` of this method
        # are elided from this view of the file.
        bitrate_list = jsonData[fmt]
        # 'best' (or an unknown bitrate) falls back to the highest available one.
        if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
            bitrate = max(bitrate_list) # select highest

        url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
2930 def check_urls(self, url_list):
2931 """Returns 1st active url from list"""
2932 for url in url_list:
2934 compat_urllib_request.urlopen(url)
2936 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2941 def _print_formats(self, formats):
2942 print('Available formats:')
2943 for fmt in formats.keys():
2944 for b in formats[fmt]:
2946 ext = formats[fmt][b][0]
2947 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2948 except TypeError: # we have no bitrate info
2949 ext = formats[fmt][0]
2950 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2953 def _real_extract(self, url):
2954 mobj = re.match(self._VALID_URL, url)
2956 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2958 # extract uploader & filename from url
2959 uploader = mobj.group(1).decode('utf-8')
2960 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2962 # construct API request
2963 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2964 # retrieve .json file with links to files
2965 request = compat_urllib_request.Request(file_url)
2967 self.report_download_json(file_url)
2968 jsonData = compat_urllib_request.urlopen(request).read()
2969 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2970 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
2974 json_data = json.loads(jsonData)
2975 player_url = json_data['player_swf_url']
2976 formats = dict(json_data['audio_formats'])
2978 req_format = self._downloader.params.get('format', None)
2981 if self._downloader.params.get('listformats', None):
2982 self._print_formats(formats)
2985 if req_format is None or req_format == 'best':
2986 for format_param in formats.keys():
2987 url_list = self.get_urls(formats, format_param)
2989 file_url = self.check_urls(url_list)
2990 if file_url is not None:
2993 if req_format not in formats:
2994 self._downloader.trouble(u'ERROR: format is not available')
2997 url_list = self.get_urls(formats, req_format)
2998 file_url = self.check_urls(url_list)
2999 format_param = req_format
3002 'id': file_id.decode('utf-8'),
3003 'url': file_url.decode('utf-8'),
3004 'uploader': uploader.decode('utf-8'),
3005 'upload_date': None,
3006 'title': json_data['name'],
3007 'ext': file_url.split('.')[-1].decode('utf-8'),
3008 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3009 'thumbnail': json_data['thumbnail_url'],
3010 'description': json_data['description'],
3011 'player_url': player_url.decode('utf-8'),
3014 class StanfordOpenClassroomIE(InfoExtractor):
3015 """Information extractor for Stanford's Open ClassRoom"""
# Matches three URL shapes: a specific VideoPage (course+video), a CoursePage
# (course only), and the site root/HomePage (neither group set).
3017 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3018 IE_NAME = u'stanfordoc'
3020 def report_download_webpage(self, objid):
3021 """Report webpage download."""
3022 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3024 def report_extraction(self, video_id):
3025 """Report information extraction."""
3026 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3028 def _real_extract(self, url):
3029 mobj = re.match(self._VALID_URL, url)
3031 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# --- Branch 1: a single video --------------------------------------------
3034 if mobj.group('course') and mobj.group('video'): # A specific video
3035 course = mobj.group('course')
3036 video = mobj.group('video')
3038 'id': course + '_' + video,
3040 'upload_date': None,
3043 self.report_extraction(info['id'])
# Video metadata lives in a per-video XML file next to the media.
3044 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3045 xmlUrl = baseUrl + video + '.xml'
3047 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
3048 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3049 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
3051 mdoc = xml.etree.ElementTree.fromstring(metaXml)
# <title> and <videoFile> are required; missing elements trip the error below.
3053 info['title'] = mdoc.findall('./title')[0].text
3054 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3056 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3058 info['ext'] = info['url'].rpartition('.')[2]
# --- Branch 2: a course page — scrape its VideoPage links and recurse ----
3060 elif mobj.group('course'): # A course page
3061 course = mobj.group('course')
3066 'upload_date': None,
3069 self.report_download_webpage(info['id'])
3071 coursepage = compat_urllib_request.urlopen(url).read()
3072 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3073 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
3076 m = re.search('<h1>([^<]+)</h1>', coursepage)
3078 info['title'] = unescapeHTML(m.group(1))
# Fall back to the id when no <h1> title is present.
3080 info['title'] = info['id']
3082 m = re.search('<description>([^<]+)</description>', coursepage)
3084 info['description'] = unescapeHTML(m.group(1))
# orderedSet deduplicates while preserving first-seen order.
3086 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3089 'type': 'reference',
3090 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
# Recursively extract each referenced video page.
3094 for entry in info['list']:
3095 assert entry['type'] == 'reference'
3096 results += self.extract(entry['url'])
# --- Branch 3: the site root — scrape CoursePage links and recurse -------
3101 'id': 'Stanford OpenClassroom',
3104 'upload_date': None,
3107 self.report_download_webpage(info['id'])
3108 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3110 rootpage = compat_urllib_request.urlopen(rootURL).read()
3111 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3112 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
3115 info['title'] = info['id']
3117 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3120 'type': 'reference',
3121 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3126 for entry in info['list']:
3127 assert entry['type'] == 'reference'
3128 results += self.extract(entry['url'])
3131 class MTVIE(InfoExtractor):
3132 """Information extractor for MTV.com"""
# Optional scheme is captured as 'proto' so it can be re-added if omitted.
3134 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3137 def report_webpage(self, video_id):
3138 """Report webpage download."""
3139 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3141 def report_extraction(self, video_id):
3142 """Report information extraction."""
3143 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3145 def _real_extract(self, url):
3146 mobj = re.match(self._VALID_URL, url)
3148 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Normalize scheme-less URLs before fetching.
3150 if not mobj.group('proto'):
3151 url = 'http://' + url
3152 video_id = mobj.group('videoid')
3153 self.report_webpage(video_id)
3155 request = compat_urllib_request.Request(url)
3157 webpage = compat_urllib_request.urlopen(request).read()
3158 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3159 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
# Song/performer/playlist metadata is embedded in <meta> tags on the page.
# NOTE(review): .decode('iso-8859-1') assumes Python 2 byte strings — confirm.
3162 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3164 self._downloader.trouble(u'ERROR: unable to extract song name')
3166 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3167 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3169 self._downloader.trouble(u'ERROR: unable to extract performer')
3171 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3172 video_title = performer + ' - ' + song_name
3174 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
# NOTE(review): message is missing the word "extract" ("unable to mtvn_uri");
# left as-is here since this edit only touches comments.
3176 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3178 mtvn_uri = mobj.group(1)
3180 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3182 self._downloader.trouble(u'ERROR: unable to extract content id')
3184 content_id = mobj.group(1)
# mediaGen.jhtml returns an XML document listing available renditions.
3186 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3187 self.report_extraction(video_id)
3188 request = compat_urllib_request.Request(videogen_url)
3190 metadataXml = compat_urllib_request.urlopen(request).read()
3191 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3192 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
3195 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3196 renditions = mdoc.findall('.//rendition')
3198 # For now, always pick the highest quality.
3199 rendition = renditions[-1]
# MIME subtype (e.g. "video/mp4" -> "mp4") becomes the extension;
# format string encodes ext, resolution and bitrate.
3202 _,_,ext = rendition.attrib['type'].partition('/')
3203 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3204 video_url = rendition.find('./src').text
3206 self._downloader.trouble('Invalid rendition field.')
3212 'uploader': performer,
3213 'upload_date': None,
3214 'title': video_title,
3222 class YoukuIE(InfoExtractor):
# Information extractor for v.youku.com; videos are served as numbered
# segments whose file ids are de-scrambled with a server-provided seed.
3224 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3227 def __init__(self, downloader=None):
3228 InfoExtractor.__init__(self, downloader)
3230 def report_download_webpage(self, file_id):
3231 """Report webpage download."""
3232 self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)
3234 def report_extraction(self, file_id):
3235 """Report information extraction."""
3236 self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)
# Session id: millisecond timestamp plus two random components.
3239 nowTime = int(time.time() * 1000)
3240 random1 = random.randint(1000,1998)
3241 random2 = random.randint(1000,9999)
3243 return "%d%d%d" %(nowTime,random1,random2)
3245 def _get_file_ID_mix_string(self, seed):
# Deterministically shuffles the alphabet below using a linear-congruential
# step on `seed`; reproduces the player's scrambling table.
3247 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3249 for i in range(len(source)):
3250 seed = (seed * 211 + 30031 ) % 65536
3251 index = math.floor(seed / 65536 * len(source) )
3252 mixed.append(source[int(index)])
3253 source.remove(source[int(index)])
3254 #return ''.join(mixed)
3257 def _get_file_id(self, fileId, seed):
# Maps each '*'-separated index through the seed-shuffled table to
# recover the real file id.
3258 mixed = self._get_file_ID_mix_string(seed)
3259 ids = fileId.split('*')
3263 realId.append(mixed[int(ch)])
3264 return ''.join(realId)
3266 def _real_extract(self, url):
3267 mobj = re.match(self._VALID_URL, url)
3269 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3271 video_id = mobj.group('ID')
# getPlayList returns a JSON config with title, seed, stream ids and segment keys.
3273 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3275 request = compat_urllib_request.Request(info_url, None, std_headers)
3277 self.report_download_webpage(video_id)
3278 jsondata = compat_urllib_request.urlopen(request).read()
3279 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3280 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3283 self.report_extraction(video_id)
3285 jsonstr = jsondata.decode('utf-8')
3286 config = json.loads(jsonstr)
3288 video_title = config['data'][0]['title']
3289 seed = config['data'][0]['seed']
3291 format = self._downloader.params.get('format', None)
3292 supported_format = list(config['data'][0]['streamfileids'].keys())
# 'best' prefers hd2 when available; 'worst' takes the other end.
3294 if format is None or format == 'best':
3295 if 'hd2' in supported_format:
3300 elif format == 'worst':
# Each segment carries its own access key 'k'.
3308 fileid = config['data'][0]['streamfileids'][format]
3309 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3310 except (UnicodeDecodeError, ValueError, KeyError):
3311 self._downloader.trouble(u'ERROR: unable to extract info section')
3315 sid = self._gen_sid()
3316 fileid = self._get_file_id(fileid, seed)
3318 #column 8,9 of fileid represent the segment number
3319 #fileid[7:9] should be changed
# Splice the hex segment index into the fileid and build one download
# URL (and one info dict) per segment.
3320 for index, key in enumerate(keys):
3322 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3323 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3326 'id': '%s_part%02d' % (video_id, index),
3327 'url': download_url,
3329 'upload_date': None,
3330 'title': video_title,
3333 files_info.append(info)
3338 class XNXXIE(InfoExtractor):
3339 """Information extractor for xnxx.com"""
# Group 1 is the numeric video id; group 2 the slug.
3341 _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
# Page-scraping patterns: flv URL and thumbnail come from query-string-style
# player parameters; title from the <title> tag.
3343 VIDEO_URL_RE = r'flv_url=(.*?)&'
3344 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3345 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3347 def report_webpage(self, video_id):
3348 """Report webpage download"""
3349 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3351 def report_extraction(self, video_id):
3352 """Report information extraction"""
3353 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3355 def _real_extract(self, url):
3356 mobj = re.match(self._VALID_URL, url)
3358 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3360 video_id = mobj.group(1)
3362 self.report_webpage(video_id)
3364 # Get webpage content
3366 webpage_bytes = compat_urllib_request.urlopen(url).read()
3367 webpage = webpage_bytes.decode('utf-8')
3368 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3369 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
# The flv URL is percent-encoded in the page; unquote it.
3372 result = re.search(self.VIDEO_URL_RE, webpage)
3374 self._downloader.trouble(u'ERROR: unable to extract video url')
3376 video_url = compat_urllib_parse.unquote(result.group(1))
3378 result = re.search(self.VIDEO_TITLE_RE, webpage)
3380 self._downloader.trouble(u'ERROR: unable to extract video title')
3382 video_title = result.group(1)
3384 result = re.search(self.VIDEO_THUMB_RE, webpage)
3386 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3388 video_thumbnail = result.group(1)
3394 'upload_date': None,
3395 'title': video_title,
3397 'thumbnail': video_thumbnail,
3398 'description': None,
3402 class GooglePlusIE(InfoExtractor):
3403 """Information extractor for plus.google.com."""
# Group 1 is the post id (\w+).
3405 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3406 IE_NAME = u'plus.google'
3408 def __init__(self, downloader=None):
3409 InfoExtractor.__init__(self, downloader)
3411 def report_extract_entry(self, url):
3412 """Report downloading entry"""
3413 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3415 def report_date(self, upload_date):
3416 """Report extracted entry date"""
3417 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3419 def report_uploader(self, uploader):
3420 """Report extracted uploader"""
3421 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3423 def report_title(self, video_title):
3424 """Report extracted title"""
3425 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3427 def report_extract_vid_page(self, video_page):
3428 """Report information extraction."""
3429 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3431 def _real_extract(self, url):
3432 # Extract id from URL
3433 mobj = re.match(self._VALID_URL, url)
3435 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3438 post_url = mobj.group(0)
3439 video_id = mobj.group(1)
3441 video_extension = 'flv'
3443 # Step 1, Retrieve post webpage to extract further information
3444 self.report_extract_entry(post_url)
3445 request = compat_urllib_request.Request(post_url)
3447 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3448 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3449 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3452 # Extract update date
3454 pattern = 'title="Timestamp">(.*?)</a>'
3455 mobj = re.search(pattern, webpage)
3457 upload_date = mobj.group(1)
3458 # Convert timestring to a format suitable for filename
3459 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3460 upload_date = upload_date.strftime('%Y%m%d')
3461 self.report_date(upload_date)
# Extract the uploader's display name from the author link.
3465 pattern = r'rel\="author".*?>(.*?)</a>'
3466 mobj = re.search(pattern, webpage)
3468 uploader = mobj.group(1)
3469 self.report_uploader(uploader)
3472 # Get the first line for title
# The Description meta tag is cut at the first newline, '<' or '"'.
3474 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3475 mobj = re.search(pattern, webpage)
3477 video_title = mobj.group(1)
3478 self.report_title(video_title)
3480 # Step 2, Stimulate clicking the image box to launch video
3481 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3482 mobj = re.search(pattern, webpage)
3484 self._downloader.trouble(u'ERROR: unable to extract video page URL')
3486 video_page = mobj.group(1)
3487 request = compat_urllib_request.Request(video_page)
3489 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3490 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3491 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3493 self.report_extract_vid_page(video_page)
3496 # Extract video links on video page
3497 """Extract video links of all sizes"""
# findall yields (resolution, url) tuples for every available size.
3498 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3499 mobj = re.findall(pattern, webpage)
3501 self._downloader.trouble(u'ERROR: unable to extract video links')
3503 # Sort in resolution
3504 links = sorted(mobj)
3506 # Choose the lowest of the sort, i.e. highest resolution
3507 video_url = links[-1]
3508 # Only get the url. The resolution part in the tuple has no use anymore
3509 video_url = video_url[-1]
3510 # Treat escaped \u0026 style hex
# str.decode exists on Python 2 only; AttributeError selects the Python 3 path.
3512 video_url = video_url.decode("unicode_escape")
3513 except AttributeError: # Python 3
3514 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3520 'uploader': uploader,
3521 'upload_date': upload_date,
3522 'title': video_title,
3523 'ext': video_extension,
3526 class NBAIE(InfoExtractor):
# Information extractor for nba.com video pages; the media URL is derived
# from the page path rather than scraped.
3527 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3530 def report_extraction(self, video_id):
3531 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3533 def _real_extract(self, url):
3534 mobj = re.match(self._VALID_URL, url)
3536 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3539 video_id = mobj.group(1)
# Strip a trailing /index.html so the id maps onto the CDN path.
3540 if video_id.endswith('/index.html'):
3541 video_id = video_id[:-len('/index.html')]
3543 self.report_extraction(video_id)
3545 urlh = compat_urllib_request.urlopen(url)
3546 webpage_bytes = urlh.read()
3547 webpage = webpage_bytes.decode('utf-8', 'ignore')
3548 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3549 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
# Direct 720p MP4 on Turner's CDN, keyed by the page path.
3552 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
# Helper: first regex group from the page, unescaped, or `default`.
3553 def _findProp(rexp, default=None):
3554 m = re.search(rexp, webpage)
3556 return unescapeHTML(m.group(1))
3560 shortened_video_id = video_id.rpartition('/')[2]
3561 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3563 'id': shortened_video_id,
# NOTE(review): key 'uploader_date' looks like a typo for 'upload_date'
# (the documented field name) — confirm against FileDownloader before fixing.
3567 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3568 'description': _findProp(r'<div class="description">(.*?)</h1>'),
3572 class JustinTVIE(InfoExtractor):
3573 """Information extractor for justin.tv and twitch.tv"""
3574 # TODO: One broadcast may be split into multiple videos. The key
3575 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3576 # starts at 1 and increases. Can we treat all parts as one video?
# Group 1: channel name; optional group 2: archive/broadcast id after /b/.
3578 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3579 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
# Page size for the paged archive API below.
3580 _JUSTIN_PAGE_LIMIT = 100
3581 IE_NAME = u'justin.tv'
3583 def report_extraction(self, file_id):
3584 """Report information extraction."""
3585 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3587 def report_download_page(self, channel, offset):
3588 """Report attempt to download a single page of videos."""
3589 self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
3590 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3592 # Return count of items, list of *valid* items
3593 def _parse_page(self, url):
3595 urlh = compat_urllib_request.urlopen(url)
3596 webpage_bytes = urlh.read()
3597 webpage = webpage_bytes.decode('utf-8', 'ignore')
3598 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3599 self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
3602 response = json.loads(webpage)
# Build one info dict per clip in the JSON page.
3604 for clip in response:
3605 video_url = clip['video_file_url']
3607 video_extension = os.path.splitext(video_url)[1][1:]
# created_on is ISO-dated; keep YYYYMMDD only.
3608 video_date = re.sub('-', '', clip['created_on'][:10])
3612 'title': clip['title'],
3613 'uploader': clip.get('user_id', clip.get('channel_id')),
3614 'upload_date': video_date,
3615 'ext': video_extension,
3617 return (len(response), info)
3619 def _real_extract(self, url):
3620 mobj = re.match(self._VALID_URL, url)
3622 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3625 api = 'http://api.justin.tv'
# lastindex == 1 means only the channel matched (full archive);
# otherwise group 2 is a single clip id.
3626 video_id = mobj.group(mobj.lastindex)
3628 if mobj.lastindex == 1:
3630 api += '/channel/archives/%s.json'
3632 api += '/clip/show/%s.json'
3633 api = api % (video_id,)
3635 self.report_extraction(video_id)
3639 limit = self._JUSTIN_PAGE_LIMIT
# Walk pages until a short (non-full) page signals the end.
3642 self.report_download_page(video_id, offset)
3643 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3644 page_count, page_info = self._parse_page(page_url)
3645 info.extend(page_info)
3646 if not paged or page_count != limit:
3651 class FunnyOrDieIE(InfoExtractor):
# Information extractor for funnyordie.com; id is a hex slug in the URL.
3652 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3653 IE_NAME = u'FunnyOrDie'
3655 def report_extraction(self, video_id):
3656 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3658 def _real_extract(self, url):
3659 mobj = re.match(self._VALID_URL, url)
3661 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3664 video_id = mobj.group('id')
3665 self.report_extraction(video_id)
3667 urlh = compat_urllib_request.urlopen(url)
3668 webpage_bytes = urlh.read()
3669 webpage = webpage_bytes.decode('utf-8', 'ignore')
3670 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3671 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
# Media URL: the second <source> inside the page's <video> element.
3674 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3676 self._downloader.trouble(u'ERROR: unable to find video information')
3677 video_url = unescapeHTML(m.group('url'))
# Title: anchor text inside the player_page_h1 heading.
3679 m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
3681 self._downloader.trouble(u'Cannot find video title')
3682 title = unescapeHTML(m.group('title'))
# Description from the OpenGraph meta tag.
3684 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3686 desc = unescapeHTML(m.group('desc'))
3695 'description': desc,
3699 class TweetReelIE(InfoExtractor):
3700 _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'
3702 def report_extraction(self, video_id):
3703 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3705 def _real_extract(self, url):
3706 mobj = re.match(self._VALID_URL, url)
3708 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3711 video_id = mobj.group('id')
3712 self.report_extraction(video_id)
3714 urlh = compat_urllib_request.urlopen(url)
3715 webpage_bytes = urlh.read()
3716 webpage = webpage_bytes.decode('utf-8', 'ignore')
3717 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3718 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
3721 m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
3723 self._downloader.trouble(u'ERROR: Cannot find status ID')
3724 status_id = m.group(1)
3726 m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
3728 self._downloader.trouble(u'WARNING: Cannot find description')
3729 desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()
3731 m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
3733 self._downloader.trouble(u'ERROR: Cannot find uploader')
3734 uploader = unescapeHTML(m.group('uploader'))
3735 uploader_id = unescapeHTML(m.group('uploader_id'))
3737 m = re.search(r'<span unixtime="([0-9]+)"', webpage)
3739 self._downloader.trouble(u'ERROR: Cannot find upload date')
3740 upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')
3743 video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'
3750 'description': desc,
3751 'uploader': uploader,
3752 'uploader_id': uploader_id,
3753 'internal_id': status_id,
3754 'upload_date': upload_date