2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
13 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title: Video title, unescaped.
    ext: Video filename extension.
    uploader: Full name of the video uploader.
    upload_date: Video upload date (YYYYMMDD).

    The following fields are optional:

    format: The video format, defaults to ext (used for --get-format)
    thumbnail: Full URL to a video thumbnail image.
    description: One-line video description.
    uploader_id: Nickname or id of the video uploader.
    player_url: SWF Player URL (used for rtmpdump).
    subtitles: The .srt file contents.
    urlhandle: [internal] The urlHandle to be used to download the file,
        like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # NOTE(review): this excerpt is line-sampled -- several structural lines
    # (method headers, guards, returns) are missing from view.  Comments below
    # annotate only what is visible; confirm against the full file.

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # A non-None match against the subclass-provided _VALID_URL pattern
        # marks the URL as handled by this extractor.
        return re.match(self._VALID_URL, url) is not None

    # NOTE(review): the enclosing "def" line for the following docstring is
    # missing from this excerpt (presumably the _WORKING accessor).
    """Getter method for _WORKING."""

    # NOTE(review): the enclosing "def" line for the following two lines is
    # missing from this excerpt (presumably the public initialize() wrapper).
    """Initializes an instance (authentication, etc)."""
    self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # Delegates to the subclass hook; see class docstring for the
        # required dictionary fields.
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""

    # NOTE(review): the enclosing "def" line for the following return is
    # missing from this excerpt (presumably an IE_NAME-style property that
    # strips the trailing "IE" from the class name).
    return type(self).__name__[:-2]
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # NOTE(review): this excerpt is line-sampled -- many structural lines
    # (try:, else:, if-guards, returns, the _VALID_URL assignment opener and
    # the bodies/closers of the dict literals below) are missing from view.
    # Comments annotate only what is visible; confirm against the full file.

    # NOTE(review): the "_VALID_URL = r'''..." opening line for this verbose
    # regex is missing from this excerpt.
        (?:https?://)? # http(s):// (optional)
        (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
           tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
        (?:.*?\#/)? # handle anchor (#/) redirect urls
        (?!view_play_list|my_playlists|artist|playlist) # ignore playlist URLs
        (?: # the various things that can precede the ID:
            (?:(?:v|embed|e)/) # v/ or embed/ or e/
            |(?: # or the v= param in all its forms
                (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                (?:\?|\#!?) # the params delimiter ? or # or #!
                (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
        )? # optional -> youtube.com/xxxx is OK
        )? # all until now is optional -> you can pass the naked ID
        ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
        (?(1).+)? # if we found the ID, everything can follow

    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> file extension map; most entries are elided in this excerpt.
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # itag -> "WIDTHxHEIGHT" display map; entries are elided in this excerpt.
    _video_dimensions = {

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overrides the base implementation to pass re.VERBOSE, since
        # _VALID_URL above is written as a commented, verbose regex.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timedtext XML into .srt subtitle text."""
        # NOTE(review): the accumulator initialization ("srt = ''"), the
        # numeric conversion of `start`, and the final "return srt" are
        # missing from this excerpt.
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # captions without a duration get 4 seconds
            end = start + float(dur)
            # Render HH:MM:SS,mmm timestamps as required by the .srt format.
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'

    def _extract_subtitles(self, video_id):
        """Fetch the subtitle track for *video_id*.

        Returns a (warning_message, srt_contents) pair; exactly one of the
        two is None.
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        # NOTE(review): the "try:" opener for the following download is
        # missing from this excerpt.
            srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        # Map language code -> track name from the listing XML.
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        # Language choice: user-requested, then English, then whatever exists.
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            # NOTE(review): the "srt_lang = 'en'" assignment and the
            # trailing "else:" line are missing from this excerpt.
            srt_lang = list(srt_lang_list.keys())[0]
        if not srt_lang in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
        # NOTE(review): another "try:" opener is missing here.
            srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
            # NOTE(review): the "if not srt_xml:" guard for this return is
            # missing from this excerpt.
            return (u'WARNING: unable to download video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))

    def _print_formats(self, formats):
        """Print the available itag formats with extension and dimensions."""
        print('Available formats:')
        # NOTE(review): the "for x in formats:" loop header is missing from
        # this excerpt.
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set language, log in (params or .netrc) and confirm age."""
        if self._downloader is None:
            # NOTE(review): the early "return" for this guard is missing
            # from this excerpt.

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # NOTE(review): the "try:" opener and the success branch reading
            # username/password from the netrc info are missing here.
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))

        # Force an English-language cookie so later scraping sees known text.
        request = compat_urllib_request.Request(self._LANG_URL)
        # NOTE(review): "try:" opener missing here.
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))

        # No authentication to be performed
        # NOTE(review): the "if username is None: return" guard and the
        # "login_form = {" opener are missing from this excerpt.
            'current_form': 'loginForm',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        # NOTE(review): "try:" opener missing here.
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # A login form in the response means the credentials were rejected.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))

        # NOTE(review): the "age_form = {" opener is missing here.
            'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        # NOTE(review): "try:" opener missing here.
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))

    def _extract_id(self, url):
        """Return the YouTube video id embedded in *url*."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            # NOTE(review): the "if mobj is None:" guard and its "return"
            # are missing from this excerpt.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        # Group 2 is the ([0-9A-Za-z_-]+) id capture in _VALID_URL.
        video_id = mobj.group(2)
        # NOTE(review): the "return video_id" line is missing here.

    def _real_extract(self, url):
        """Download the watch page + get_video_info and build info dicts."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
            # NOTE(review): the "if mobj is not None:" guard is missing here.
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        # NOTE(review): "try:" opener missing here.
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
            # NOTE(review): the "if mobj is not None:" guard and the "else:
            # player_url = None" branch are missing from this excerpt.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Get video info: try several &el= variants until one yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            # NOTE(review): "try:" opener and the loop "break" after a token
            # is found are missing from this excerpt.
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
                # NOTE(review): the "else:" before this branch is missing.
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/user/([^"]+)">', video_webpage)
            # NOTE(review): the "if mobj is not None: ... else:" guard pair
            # around the next two lines is missing from this excerpt.
            video_uploader_id = mobj.group(1)
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: scrape the watch page and normalize to YYYYMMDD.
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
            # NOTE(review): the "upload_date = None" default and the
            # "if mobj is not None:" guard are missing from this excerpt.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                # NOTE(review): the "try:/except ValueError: pass" wrapper
                # around the parse attempt is missing here.
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
            # NOTE(review): the "else:" before the fallback is missing.
            video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
                # NOTE(review): the "if srt_error:" guard is missing here.
                self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            # NOTE(review): the "video_duration = ''" fallback and "else:"
            # are missing from this excerpt.
        video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
                # NOTE(review): the "else:" before the fallback is missing.
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # NOTE(review): the "else:" opening this branch is missing.
                # Specific formats. We pick the first in a slash-delimited sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    # NOTE(review): the "if rf in url_map:" guard and the
                    # loop "break" are missing from this excerpt.
                        video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
            # NOTE(review): the "else:" opening this branch is missing.
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

        # Build one info dict per selected format.
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            # NOTE(review): the "results.append({" opener (and the final
            # "return results") are missing from this excerpt.
                'url': video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension,
                'format': video_format,
                'thumbnail': video_thumbnail,
                'description': video_description,
                'player_url': player_url,
                'subtitles': video_subtitles,
                'duration': video_duration
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # NOTE(review): this excerpt is line-sampled -- "try:" openers, "if mobj
    # is None:" guards, "return" statements and the result-dict opener are
    # missing from view.  Comments annotate only what is visible.

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page, then POST the family-filter opt-out."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        # NOTE(review): "try:" opener missing here.
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age
        # NOTE(review): the "disclaimer_form = {" opener is missing here.
            'submit': "Continue - I'm over 18",
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        # NOTE(review): "try:" opener missing here.
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        """Extract the media URL, uploader and title for a metacafe video."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            # NOTE(review): the "if mobj is None:" guard and its "return"
            # are missing from this excerpt.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate yt-prefixed ids to the YouTube extractor via the
            # downloader; the "return" after this call is missing here.
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        # NOTE(review): "try:" opener missing here.
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
            # NOTE(review): the "if mobj is not None:" guard is missing here;
            # the flashvars path below is presumably the else-branch.
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
            # Fallback: pull the media URL out of the flashvars blob.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        # .decode('utf-8') on a match group is Python 2 bytes handling.
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # NOTE(review): the "return [{" opener for the result list/dict is
        # missing from this excerpt.
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # NOTE(review): this excerpt is line-sampled -- "try:" openers, "if mobj
    # is None:" guards, "return" statements and the result-dict opener are
    # missing from view.  Comments annotate only what is visible.

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract video URL, uploader, title and date from a Dailymotion page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            # NOTE(review): the "if mobj is None:" guard and its "return"
            # are missing from this excerpt.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Strip title suffix and query noise from the captured id segment.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter so restricted videos are still served.
        request.add_header('Cookie', 'family_filter=off')
        # NOTE(review): "try:" opener missing here.
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Pick the best quality key present in flashvars (ordered best-first).
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            # NOTE(review): the "if key in flashvars:" guard, the
            # "max_quality = key" assignment, the "break", and the trailing
            # "else:" are missing from this excerpt.
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
            self._downloader.trouble(u'ERROR: unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
            self._downloader.trouble(u'ERROR: unable to extract video URL')

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
            # NOTE(review): the "if mobj is None:" guard is missing; the
            # block below is presumably its body, with the final assignment
            # being the else-branch.
            # looking for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
                video_uploader = mobj_official.group(1)
            video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
            # Page shows DD-MM-YYYY; reorder to YYYYMMDD.
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        # NOTE(review): the "return [{" opener (and the id/url/ext entries
        # before these) are missing from this excerpt.
            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # NOTE(review): this excerpt is line-sampled -- "try:" openers, "if mobj
    # is None:" guards, "return" statements and the result-dict opener are
    # missing from view.  Comments annotate only what is visible.

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the flv URL, uploader and title from a Photobucket page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            # NOTE(review): the "if mobj is None:" guard and its "return"
            # are missing from this excerpt.
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # NOTE(review): "try:" opener missing here.
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))
        # NOTE(review): the "video_url = mediaURL" assignment is missing
        # from this excerpt.

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        # .decode('utf-8') on a match group is Python 2 bytes handling.
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        # NOTE(review): the "return [{" opener for the result list/dict is
        # missing from this excerpt.
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # NOTE(review): this excerpt is line-sampled -- "try:" openers, "if mobj
    # is None:" guards, "return" statements and the result-dict opener are
    # missing from view.  Comments annotate only what is visible.

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract Yahoo! Video info; may recurse once after rewriting the
        URL into the canonical /watch/ form (new_video=False on recursion)."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
            # NOTE(review): the "if mobj is None:" guard and its "return"
            # are missing from this excerpt.
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            # NOTE(review): "try:" opener missing here.
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
                self._downloader.trouble(u'ERROR: Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            # Recurse exactly once with the canonical watch URL.
            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # NOTE(review): "try:" opener missing here.
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video title')
        # .decode('utf-8') on a match group is Python 2 bytes handling.
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
        yv_bitrate = '700' # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        # NOTE(review): "try:" opener missing here.
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        # NOTE(review): the "return [{" opener for the result list/dict is
        # missing from this excerpt.
            'id': video_id.decode('utf-8'),
            'uploader': video_uploader,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
960 class VimeoIE(InfoExtractor):
961 """Information extractor for vimeo.com."""
963 # _VALID_URL matches Vimeo URLs
964 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
def __init__(self, downloader=None):
    """Create the Vimeo extractor, optionally attaching a downloader."""
    InfoExtractor.__init__(self, downloader)
def report_download_webpage(self, video_id):
    """Announce on screen that the vimeo page for *video_id* is being fetched."""
    message = u'[vimeo] %s: Downloading webpage' % video_id
    self._downloader.to_screen(message)
def report_extraction(self, video_id):
    """Announce that information extraction for *video_id* has started."""
    message = u'[vimeo] %s: Extracting information' % video_id
    self._downloader.to_screen(message)
978 def _real_extract(self, url, new_video=True):
979 # Extract ID from URL
980 mobj = re.match(self._VALID_URL, url)
982 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
985 video_id = mobj.group(1)
987 # Retrieve video webpage to extract further information
988 request = compat_urllib_request.Request(url, None, std_headers)
990 self.report_download_webpage(video_id)
991 webpage_bytes = compat_urllib_request.urlopen(request).read()
992 webpage = webpage_bytes.decode('utf-8')
993 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
994 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
997 # Now we begin extracting as much information as we can from what we
998 # retrieved. First we extract the information common to all extractors,
999 # and latter we extract those that are Vimeo specific.
1000 self.report_extraction(video_id)
1002 # Extract the config JSON
1004 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1005 config = json.loads(config)
1007 self._downloader.trouble(u'ERROR: unable to extract info section')
1011 video_title = config["video"]["title"]
1013 # Extract uploader and uploader_id
1014 video_uploader = config["video"]["owner"]["name"]
1015 video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]
1017 # Extract video thumbnail
1018 video_thumbnail = config["video"]["thumbnail"]
1020 # Extract video description
1021 video_description = get_element_by_attribute("itemprop", "description", webpage)
1022 if video_description: video_description = clean_html(video_description)
1023 else: video_description = ''
1025 # Extract upload date
1026 video_upload_date = None
1027 mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1028 if mobj is not None:
1029 video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1031 # Vimeo specific: extract request signature and timestamp
1032 sig = config['request']['signature']
1033 timestamp = config['request']['timestamp']
1035 # Vimeo specific: extract video codec and quality information
1036 # First consider quality, then codecs, then take everything
1037 # TODO bind to format param
1038 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1039 files = { 'hd': [], 'sd': [], 'other': []}
1040 for codec_name, codec_extension in codecs:
1041 if codec_name in config["video"]["files"]:
1042 if 'hd' in config["video"]["files"][codec_name]:
1043 files['hd'].append((codec_name, codec_extension, 'hd'))
1044 elif 'sd' in config["video"]["files"][codec_name]:
1045 files['sd'].append((codec_name, codec_extension, 'sd'))
1047 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
1049 for quality in ('hd', 'sd', 'other'):
1050 if len(files[quality]) > 0:
1051 video_quality = files[quality][0][2]
1052 video_codec = files[quality][0][0]
1053 video_extension = files[quality][0][1]
1054 self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1057 self._downloader.trouble(u'ERROR: no known codec found')
1060 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1061 %(video_id, sig, timestamp, video_quality, video_codec.upper())
1066 'uploader': video_uploader,
1067 'uploader_id': video_uploader_id,
1068 'upload_date': video_upload_date,
1069 'title': video_title,
1070 'ext': video_extension,
1071 'thumbnail': video_thumbnail,
1072 'description': video_description,
1076 class ArteTvIE(InfoExtractor):
1077 """arte.tv information extractor."""
1079 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1080 _LIVE_URL = r'index-[0-9]+\.html$'
1082 IE_NAME = u'arte.tv'
def __init__(self, downloader=None):
    """Create the arte.tv extractor, optionally attaching a downloader."""
    InfoExtractor.__init__(self, downloader)
def report_download_webpage(self, video_id):
    """Announce that the arte.tv page for *video_id* is being fetched."""
    message = u'[arte.tv] %s: Downloading webpage' % video_id
    self._downloader.to_screen(message)
def report_extraction(self, video_id):
    """Announce that information extraction for *video_id* has started."""
    message = u'[arte.tv] %s: Extracting information' % video_id
    self._downloader.to_screen(message)
1095 def fetch_webpage(self, url):
1096 request = compat_urllib_request.Request(url)
1098 self.report_download_webpage(url)
1099 webpage = compat_urllib_request.urlopen(request).read()
1100 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1101 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1103 except ValueError as err:
1104 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1108 def grep_webpage(self, url, regex, regexFlags, matchTuples):
1109 page = self.fetch_webpage(url)
1110 mobj = re.search(regex, page, regexFlags)
1114 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1117 for (i, key, err) in matchTuples:
1118 if mobj.group(i) is None:
1119 self._downloader.trouble(err)
1122 info[key] = mobj.group(i)
1126 def extractLiveStream(self, url):
1127 video_lang = url.split('/')[-4]
1128 info = self.grep_webpage(
1130 r'src="(.*?/videothek_js.*?\.js)',
1133 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1136 http_host = url.split('/')[2]
1137 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1138 info = self.grep_webpage(
1140 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1141 '(http://.*?\.swf).*?' +
1145 (1, 'path', u'ERROR: could not extract video path: %s' % url),
1146 (2, 'player', u'ERROR: could not extract video player: %s' % url),
1147 (3, 'url', u'ERROR: could not extract video url: %s' % url)
1150 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1152 def extractPlus7Stream(self, url):
1153 video_lang = url.split('/')[-3]
1154 info = self.grep_webpage(
1156 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1159 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1162 next_url = compat_urllib_parse.unquote(info.get('url'))
1163 info = self.grep_webpage(
1165 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1168 (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
1171 next_url = compat_urllib_parse.unquote(info.get('url'))
1173 info = self.grep_webpage(
1175 r'<video id="(.*?)".*?>.*?' +
1176 '<name>(.*?)</name>.*?' +
1177 '<dateVideo>(.*?)</dateVideo>.*?' +
1178 '<url quality="hd">(.*?)</url>',
1181 (1, 'id', u'ERROR: could not extract video id: %s' % url),
1182 (2, 'title', u'ERROR: could not extract video title: %s' % url),
1183 (3, 'date', u'ERROR: could not extract video date: %s' % url),
1184 (4, 'url', u'ERROR: could not extract video url: %s' % url)
1189 'id': info.get('id'),
1190 'url': compat_urllib_parse.unquote(info.get('url')),
1191 'uploader': u'arte.tv',
1192 'upload_date': info.get('date'),
1193 'title': info.get('title').decode('utf-8'),
1199 def _real_extract(self, url):
1200 video_id = url.split('/')[-1]
1201 self.report_extraction(video_id)
1203 if re.search(self._LIVE_URL, video_id) is not None:
1204 self.extractLiveStream(url)
1207 info = self.extractPlus7Stream(url)
1212 class GenericIE(InfoExtractor):
1213 """Generic last-resort information extractor."""
1216 IE_NAME = u'generic'
def __init__(self, downloader=None):
    """Create the generic fallback extractor, optionally attaching a downloader."""
    InfoExtractor.__init__(self, downloader)
def report_download_webpage(self, video_id):
    """Warn that the generic extractor is in use, then announce the page download."""
    for msg in (u'WARNING: Falling back on generic information extractor.',
                u'[generic] %s: Downloading webpage' % video_id):
        self._downloader.to_screen(msg)
def report_extraction(self, video_id):
    """Announce that information extraction for *video_id* has started."""
    message = u'[generic] %s: Extracting information' % video_id
    self._downloader.to_screen(message)
def report_following_redirect(self, new_url):
    """Announce that a redirect (e.g. a URL shortener) is being followed."""
    message = u'[redirect] Following redirect to %s' % new_url
    self._downloader.to_screen(message)
1234 def _test_redirect(self, url):
1235 """Check if it is a redirect, like url shorteners, in case restart chain."""
1236 class HeadRequest(compat_urllib_request.Request):
1237 def get_method(self):
1240 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1242 Subclass the HTTPRedirectHandler to make it use our
1243 HeadRequest also on the redirected URL
1245 def redirect_request(self, req, fp, code, msg, headers, newurl):
1246 if code in (301, 302, 303, 307):
1247 newurl = newurl.replace(' ', '%20')
1248 newheaders = dict((k,v) for k,v in req.headers.items()
1249 if k.lower() not in ("content-length", "content-type"))
1250 return HeadRequest(newurl,
1252 origin_req_host=req.get_origin_req_host(),
1255 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1257 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1259 Fallback to GET if HEAD is not allowed (405 HTTP error)
1261 def http_error_405(self, req, fp, code, msg, headers):
1265 newheaders = dict((k,v) for k,v in req.headers.items()
1266 if k.lower() not in ("content-length", "content-type"))
1267 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1269 origin_req_host=req.get_origin_req_host(),
1273 opener = compat_urllib_request.OpenerDirector()
1274 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1275 HTTPMethodFallback, HEADRedirectHandler,
1276 compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1277 opener.add_handler(handler())
1279 response = opener.open(HeadRequest(url))
1280 new_url = response.geturl()
1285 self.report_following_redirect(new_url)
1286 self._downloader.download([new_url])
1289 def _real_extract(self, url):
1290 if self._test_redirect(url): return
1292 video_id = url.split('/')[-1]
1293 request = compat_urllib_request.Request(url)
1295 self.report_download_webpage(video_id)
1296 webpage = compat_urllib_request.urlopen(request).read()
1297 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1298 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1300 except ValueError as err:
1301 # since this is the last-resort InfoExtractor, if
1302 # this error is thrown, it'll be thrown here
1303 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1306 self.report_extraction(video_id)
1307 # Start with something easy: JW Player in SWFObject
1308 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1310 # Broaden the search a little bit
1311 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1313 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1316 # It's possible that one of the regexes
1317 # matched, but returned an empty group:
1318 if mobj.group(1) is None:
1319 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1322 video_url = compat_urllib_parse.unquote(mobj.group(1))
1323 video_id = os.path.basename(video_url)
1325 # here's a fun little line of code for you:
1326 video_extension = os.path.splitext(video_id)[1][1:]
1327 video_id = os.path.splitext(video_id)[0]
1329 # it's tempting to parse this further, but you would
1330 # have to take into account all the variations like
1331 # Video Title - Site Name
1332 # Site Name | Video Title
1333 # Video Title - Tagline | Site Name
1334 # and so on and so forth; it's just not practical
1335 mobj = re.search(r'<title>(.*)</title>', webpage)
1337 self._downloader.trouble(u'ERROR: unable to extract title')
1339 video_title = mobj.group(1)
1341 # video uploader is domain name
1342 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1344 self._downloader.trouble(u'ERROR: unable to extract title')
1346 video_uploader = mobj.group(1)
1351 'uploader': video_uploader,
1352 'upload_date': None,
1353 'title': video_title,
1354 'ext': video_extension,
1358 class YoutubeSearchIE(InfoExtractor):
1359 """Information Extractor for YouTube search queries."""
1360 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1361 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1362 _max_youtube_results = 1000
1363 IE_NAME = u'youtube:search'
def __init__(self, downloader=None):
    """Create the YouTube search extractor, optionally attaching a downloader."""
    InfoExtractor.__init__(self, downloader)
def report_download_page(self, query, pagenum):
    """Report attempt to download search page with given number.

    *query* may arrive as bytes (callers encode it before searching);
    decode only in that case. Unconditionally calling ``query.decode``
    crashes on Python 3, where ``str`` has no ``decode`` method.
    """
    if isinstance(query, bytes):
        query = query.decode(preferredencoding())
    self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1373 def _real_extract(self, query):
1374 mobj = re.match(self._VALID_URL, query)
1376 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1379 prefix, query = query.split(':')
1381 query = query.encode('utf-8')
1383 self._download_n_results(query, 1)
1385 elif prefix == 'all':
1386 self._download_n_results(query, self._max_youtube_results)
1392 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1394 elif n > self._max_youtube_results:
1395 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1396 n = self._max_youtube_results
1397 self._download_n_results(query, n)
1399 except ValueError: # parsing prefix as integer fails
1400 self._download_n_results(query, 1)
1403 def _download_n_results(self, query, n):
1404 """Downloads a specified number of results for a query"""
1410 while (50 * pagenum) < limit:
1411 self.report_download_page(query, pagenum+1)
1412 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1413 request = compat_urllib_request.Request(result_url)
1415 data = compat_urllib_request.urlopen(request).read()
1416 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1417 self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
1419 api_response = json.loads(data)['data']
1421 new_ids = list(video['id'] for video in api_response['items'])
1422 video_ids += new_ids
1424 limit = min(n, api_response['totalItems'])
1427 if len(video_ids) > n:
1428 video_ids = video_ids[:n]
1429 for id in video_ids:
1430 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1434 class GoogleSearchIE(InfoExtractor):
1435 """Information Extractor for Google Video search queries."""
1436 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1437 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1438 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1439 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1440 _max_google_results = 1000
1441 IE_NAME = u'video.google:search'
def __init__(self, downloader=None):
    """Create the Google Video search extractor, optionally attaching a downloader."""
    InfoExtractor.__init__(self, downloader)
def report_download_page(self, query, pagenum):
    """Report attempt to download playlist page with given number.

    *query* may arrive as bytes (callers encode it before searching);
    decode only in that case. Unconditionally calling ``query.decode``
    crashes on Python 3, where ``str`` has no ``decode`` method.
    """
    if isinstance(query, bytes):
        query = query.decode(preferredencoding())
    self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1451 def _real_extract(self, query):
1452 mobj = re.match(self._VALID_URL, query)
1454 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1457 prefix, query = query.split(':')
1459 query = query.encode('utf-8')
1461 self._download_n_results(query, 1)
1463 elif prefix == 'all':
1464 self._download_n_results(query, self._max_google_results)
1470 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1472 elif n > self._max_google_results:
1473 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1474 n = self._max_google_results
1475 self._download_n_results(query, n)
1477 except ValueError: # parsing prefix as integer fails
1478 self._download_n_results(query, 1)
1481 def _download_n_results(self, query, n):
1482 """Downloads a specified number of results for a query"""
1488 self.report_download_page(query, pagenum)
1489 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
1490 request = compat_urllib_request.Request(result_url)
1492 page = compat_urllib_request.urlopen(request).read()
1493 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1494 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1497 # Extract video identifiers
1498 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1499 video_id = mobj.group(1)
1500 if video_id not in video_ids:
1501 video_ids.append(video_id)
1502 if len(video_ids) == n:
1503 # Specified n videos reached
1504 for id in video_ids:
1505 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1508 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1509 for id in video_ids:
1510 self._downloader#!/usr/bin/env python
1511 # -*- coding: utf-8 -*-
1513 from __future__ import absolute_import
1522 import xml.etree.ElementTree
1526 from .utils import *
1529 class InfoExtractor(object):
1530 """Information Extractor class.
1532 Information extractors are the classes that, given a URL, extract
1533 information about the video (or videos) the URL refers to. This
1534 information includes the real video URL, the video title, author and
1535 others. The information is stored in a dictionary which is then
1536 passed to the FileDownloader. The FileDownloader processes this
1537 information possibly downloading the video to the file system, among
1538 other possible outcomes.
1540 The dictionaries must include the following fields:
1542 id: Video identifier.
1543 url: Final video URL.
1544 title: Video title, unescaped.
1545 ext: Video filename extension.
1546 uploader: Full name of the video uploader.
1547 upload_date: Video upload date (YYYYMMDD).
1549 The following fields are optional:
1551 format: The video format, defaults to ext (used for --get-format)
1552 thumbnail: Full URL to a video thumbnail image.
1553 description: One-line video description.
1554 uploader_id: Nickname or id of the video uploader.
1555 player_url: SWF Player URL (used for rtmpdump).
1556 subtitles: The .srt file contents.
1557 urlhandle: [internal] The urlHandle to be used to download the file,
1558 like returned by urllib.request.urlopen
1560 The fields should all be Unicode strings.
1562 Subclasses of this one should re-define the _real_initialize() and
1563 _real_extract() methods and define a _VALID_URL regexp.
1564 Probably, they should also be added to the list of extractors.
1566 _real_extract() must return a *list* of information dictionaries as
1569 Finally, the _WORKING attribute should be set to False for broken IEs
1570 in order to warn the users and skip the tests.
1577 def __init__(self, downloader=None):
1578 """Constructor. Receives an optional downloader."""
1580 self.set_downloader(downloader)
def suitable(self, url):
    """Return True when *url* matches this extractor's _VALID_URL pattern."""
    return bool(re.match(self._VALID_URL, url))
1587 """Getter method for _WORKING."""
1588 return self._WORKING
1590 def initialize(self):
1591 """Initializes an instance (authentication, etc)."""
1593 self._real_initialize()
1596 def extract(self, url):
1597 """Extracts URL information and returns it in list of dicts."""
1599 return self._real_extract(url)
def set_downloader(self, downloader):
    """Attach *downloader* to this extractor for later status reporting."""
    self._downloader = downloader
1605 def _real_initialize(self):
1606 """Real initialization process. Redefine in subclasses."""
1609 def _real_extract(self, url):
1610 """Real extraction process. Redefine in subclasses."""
1615 return type(self).__name__[:-2]
1617 class YoutubeIE(InfoExtractor):
1618 """Information extractor for youtube.com."""
1622 (?:https?://)? # http(s):// (optional)
1623 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
1624 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
1625 (?:.*?\#/)? # handle anchor (#/) redirect urls
1626 (?!view_play_list|my_playlists|artist|playlist) # ignore playlist URLs
1627 (?: # the various things that can precede the ID:
1628 (?:(?:v|embed|e)/) # v/ or embed/ or e/
1629 |(?: # or the v= param in all its forms
1630 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
1631 (?:\?|\#!?) # the params delimiter ? or # or #!
1632 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
1635 )? # optional -> youtube.com/xxxx is OK
1636 )? # all until now is optional -> you can pass the naked ID
1637 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
1638 (?(1).+)? # if we found the ID, everything can follow
1640 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1641 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1642 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1643 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
1644 _NETRC_MACHINE = 'youtube'
1645 # Listed in order of quality
1646 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
1647 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
1648 _video_extensions = {
1654 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1660 _video_dimensions = {
1676 IE_NAME = u'youtube'
def suitable(self, url):
    """Return True when *url* matches _VALID_URL (a verbose-mode regexp)."""
    return bool(re.match(self._VALID_URL, url, re.VERBOSE))
def report_lang(self):
    """Announce that the interface language is being set."""
    self._downloader.to_screen(u'[youtube] Setting language')
def report_login(self):
    """Announce that a login attempt is underway."""
    self._downloader.to_screen(u'[youtube] Logging in')
def report_age_confirmation(self):
    """Announce that the age-gate confirmation is being submitted."""
    self._downloader.to_screen(u'[youtube] Confirming age')
def report_video_webpage_download(self, video_id):
    """Announce that the watch page for *video_id* is being fetched."""
    message = u'[youtube] %s: Downloading video webpage' % video_id
    self._downloader.to_screen(message)
def report_video_info_webpage_download(self, video_id):
    """Announce that the get_video_info page for *video_id* is being fetched."""
    message = u'[youtube] %s: Downloading video info webpage' % video_id
    self._downloader.to_screen(message)
def report_video_subtitles_download(self, video_id):
    """Report attempt to download video subtitles.

    (Docstring fixed: it was copy-pasted from the info-webpage reporter
    and wrongly claimed this method reports the info webpage download.)
    """
    self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
def report_information_extraction(self, video_id):
    """Announce that metadata extraction for *video_id* has started."""
    message = u'[youtube] %s: Extracting video information' % video_id
    self._downloader.to_screen(message)
def report_unavailable_format(self, video_id, format):
    """Report that the requested *format* is not available for *video_id*.

    (Docstring fixed: it previously said "Report extracted video URL.",
    a copy-paste error — this method reports an unavailable format.)
    """
    self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
def report_rtmp_download(self):
    """Announce that the download will use the RTMP protocol."""
    self._downloader.to_screen(u'[youtube] RTMP download detected')
1718 def _closed_captions_xml_to_srt(self, xml_string):
1720 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
1721 # TODO parse xml instead of regex
1722 for n, (start, dur_tag, dur, caption) in enumerate(texts):
1723 if not dur: dur = '4'
1724 start = float(start)
1725 end = start + float(dur)
1726 start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
1727 end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
1728 caption = unescapeHTML(caption)
1729 caption = unescapeHTML(caption) # double cycle, intentional
1730 srt += str(n+1) + '\n'
1731 srt += start + ' --> ' + end + '\n'
1732 srt += caption + '\n\n'
1735 def _extract_subtitles(self, video_id):
1736 self.report_video_subtitles_download(video_id)
1737 request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
1739 srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
1740 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1741 return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
1742 srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
1743 srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
1744 if not srt_lang_list:
1745 return (u'WARNING: video has no closed captions', None)
1746 if self._downloader.params.get('subtitleslang', False):
1747 srt_lang = self._downloader.params.get('subtitleslang')
1748 elif 'en' in srt_lang_list:
1751 srt_lang = list(srt_lang_list.keys())[0]
1752 if not srt_lang in srt_lang_list:
1753 return (u'WARNING: no closed captions found in the specified language', None)
1754 request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
1756 srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
1757 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1758 return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
1760 return (u'WARNING: unable to download video subtitles', None)
1761 return (None, self._closed_captions_xml_to_srt(srt_xml))
1763 def _print_formats(self, formats):
1764 print('Available formats:')
1766 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
1768 def _real_initialize(self):
1769 if self._downloader is None:
1774 downloader_params = self._downloader.params
1776 # Attempt to use provided username and password or .netrc data
1777 if downloader_params.get('username', None) is not None:
1778 username = downloader_params['username']
1779 password = downloader_params['password']
1780 elif downloader_params.get('usenetrc', False):
1782 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1783 if info is not None:
1787 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1788 except (IOError, netrc.NetrcParseError) as err:
1789 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
1793 request = compat_urllib_request.Request(self._LANG_URL)
1796 compat_urllib_request.urlopen(request).read()
1797 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1798 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
1801 # No authentication to be performed
1802 if username is None:
1807 'current_form': 'loginForm',
1809 'action_login': 'Log In',
1810 'username': username,
1811 'password': password,
1813 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
1816 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
1817 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1818 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1820 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1821 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
1827 'action_confirm': 'Confirm',
1829 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
1831 self.report_age_confirmation()
1832 age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
1833 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1834 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
1837 def _extract_id(self, url):
1838 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1840 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1842 video_id = mobj.group(2)
1845 def _real_extract(self, url):
1846 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1847 mobj = re.search(self._NEXT_URL_RE, url)
1849 url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
1850 video_id = self._extract_id(url)
1853 self.report_video_webpage_download(video_id)
1854 url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
1855 request = compat_urllib_request.Request(url)
1857 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
1858 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1859 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
1862 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
1864 # Attempt to extract SWF player URL
1865 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1866 if mobj is not None:
1867 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1872 self.report_video_info_webpage_download(video_id)
1873 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1874 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1875 % (video_id, el_type))
1876 request = compat_urllib_request.Request(video_info_url)
1878 video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
1879 video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
1880 video_info = compat_parse_qs(video_info_webpage)
1881 if 'token' in video_info:
1883 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1884 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
1886 if 'token' not in video_info:
1887 if 'reason' in video_info:
1888 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
1890 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1893 # Check for "rental" videos
1894 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
1895 self._downloader.trouble(u'ERROR: "rental" videos not supported')
1898 # Start extracting information
1899 self.report_information_extraction(video_id)
1902 if 'author' not in video_info:
1903 self._downloader.trouble(u'ERROR: unable to extract uploader name')
1905 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
1908 video_uploader_id = None
1909 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/user/([^"]+)">', video_webpage)
1910 if mobj is not None:
1911 video_uploader_id = mobj.group(1)
1913 self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
1916 if 'title' not in video_info:
1917 self._downloader.trouble(u'ERROR: unable to extract video title')
1919 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
1922 if 'thumbnail_url' not in video_info:
1923 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1924 video_thumbnail = ''
1925 else: # don't panic if we can't find it
1926 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
1930 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1931 if mobj is not None:
1932 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1933 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1934 for expression in format_expressions:
1936 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1941 video_description = get_element_by_id("eow-description", video_webpage)
1942 if video_description:
1943 video_description = clean_html(video_description)
1945 video_description = ''
1948 video_subtitles = None
1949 if self._downloader.params.get('writesubtitles', False):
1950 (srt_error, video_subtitles) = self._extract_subtitles(video_id)
1952 self._downloader.trouble(srt_error)
1954 if 'length_seconds' not in video_info:
1955 self._downloader.trouble(u'WARNING: unable to extract video duration')
1958 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
1961 video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
1963 # Decide which formats to download
1964 req_format = self._downloader.params.get('format', None)
1966 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1967 self.report_rtmp_download()
1968 video_url_list = [(None, video_info['conn'][0])]
1969 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1970 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1971 url_data = [compat_parse_qs(uds) for uds in url_data_strs]
1972 url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
1973 url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
1975 format_limit = self._downloader.params.get('format_limit', None)
1976 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
1977 if format_limit is not None and format_limit in available_formats:
1978 format_list = available_formats[available_formats.index(format_limit):]
1980 format_list = available_formats
1981 existing_formats = [x for x in format_list if x in url_map]
1982 if len(existing_formats) == 0:
1983 self._downloader.trouble(u'ERROR: no known formats available for video')
1985 if self._downloader.params.get('listformats', None):
1986 self._print_formats(existing_formats)
1988 if req_format is None or req_format == 'best':
1989 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1990 elif req_format == 'worst':
1991 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1992 elif req_format in ('-1', 'all'):
1993 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1995 # Specific formats. We pick the first in a slash-delimeted sequence.
1996 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1997 req_formats = req_format.split('/')
1998 video_url_list = None
1999 for rf in req_formats:
2001 video_url_list = [(rf, url_map[rf])]
2003 if video_url_list is None:
2004 self._downloader.trouble(u'ERROR: requested format not available')
2007 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
2011 for format_param, video_real_url in video_url_list:
2013 video_extension = self._video_extensions.get(format_param, 'flv')
2015 video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
2016 self._video_dimensions.get(format_param, '???'))
2020 'url': video_real_url,
2021 'uploader': video_uploader,
2022 'uploader_id': video_uploader_id,
2023 'upload_date': upload_date,
2024 'title': video_title,
2025 'ext': video_extension,
2026 'format': video_format,
2027 'thumbnail': video_thumbnail,
2028 'description': video_description,
2029 'player_url': player_url,
2030 'subtitles': video_subtitles,
2031 'duration': video_duration
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com.

    NOTE(review): this copy of the file has lost several physical lines
    (``try:`` statements, ``if mobj is None:`` guards, the ``return [{``
    openings).  The surviving code tokens are left untouched below; lost
    spots are flagged with NOTE(review) comments.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    # Visiting this page establishes the family-filter cookie context.
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    # POST endpoint used to confirm age / disable the family filter.
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        # NOTE(review): enclosing 'try:' line lost in this copy.
        self.report_disclaimer()
        disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age
        # NOTE(review): the 'disclaimer_form = {' opening (and likely a
        # 'filters' entry) was lost here; one dict entry survives.
        'submit': "Continue - I'm over 18",
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        # NOTE(review): enclosing 'try:' line lost in this copy.
        self.report_age_confirmation()
        disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): 'if mobj is None:' guard and 'return' lost here.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate YouTube-hosted videos to the YouTube extractor.
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        # NOTE(review): enclosing 'try:' line lost in this copy.
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            # Extension is guessed from the last three URL characters.
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            # NOTE(review): 'if mobj is None:' / 'else:' structure lost here.
            video_url = mediaURL
            gdaKey = mobj.group(1)
            video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)

        # Fallback: parse the flashvars block instead.
        mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
        # NOTE(review): 'if mobj is None:' guard lost here.
        self._downloader.trouble(u'ERROR: unable to extract media URL')

        vardict = compat_parse_qs(mobj.group(1))
        if 'mediaData' not in vardict:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
        # NOTE(review): 'if mobj is None:' guard lost here.
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        # Unescape JSON-escaped slashes in the media URL.
        mediaURL = mobj.group(1).replace('\\/', '/')
        video_extension = mediaURL[-3:]
        video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        # NOTE(review): 'if mobj is None:' guard lost here.
        self._downloader.trouble(u'ERROR: unable to extract title')
        # NOTE(review): .decode() on a str fails under Python 3 — webpage
        # handling here predates the bytes/str split; verify before running.
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        # NOTE(review): 'if mobj is None:' guard lost here.
        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # NOTE(review): the 'return [{' opening of the info dict lost here.
        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'uploader': video_uploader.decode('utf-8'),
        'upload_date': None,
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion.

    NOTE(review): this copy has lost physical lines ('if mobj is None:'
    guards, 'max_quality = key' / 'break' inside the quality loop, the
    'return [{' opening).  Code tokens are preserved; losses are flagged.
    """

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): 'if mobj is None:' guard and 'return' lost here.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Strip title suffix and query noise from the captured id.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter so age-restricted pages still render.
        request.add_header('Cookie', 'family_filter=off')
        # NOTE(review): enclosing 'try:' line lost in this copy.
        self.report_download_webpage(video_id)
        webpage_bytes = compat_urllib_request.urlopen(request).read()
        webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        # NOTE(review): 'if mobj is None:' guard lost here.
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Pick the best available quality, highest first.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                # NOTE(review): 'max_quality = key' assignment lost here.
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
                # NOTE(review): 'break' and the loop's 'else:' clause lost here.
        self._downloader.trouble(u'ERROR: unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        # NOTE(review): 'if mobj is None:' guard lost here.
        self._downloader.trouble(u'ERROR: unable to extract video URL')

        # Unescape JSON-escaped slashes.
        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        # NOTE(review): 'if mobj is None:' guard lost here.
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        # NOTE(review): 'if mobj is None:' branch structure lost around here.
        # looking for official user
        mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
        if mobj_official is None:
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
        # NOTE(review): 'else:' lines lost before each assignment below.
        video_uploader = mobj_official.group(1)
        video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if mobj is not None:
            # Page shows DD-MM-YYYY; reassemble as YYYYMMDD.
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        # NOTE(review): the 'return [{' opening and 'id'/'url' entries lost here.
        'uploader': video_uploader,
        'upload_date': video_upload_date,
        'title': video_title,
        'ext': video_extension,
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com.

    NOTE(review): this copy has lost physical lines ('try:' statements,
    'if mobj is None:' guards, the 'return [{' opening).  Code tokens are
    preserved; losses are flagged inline.
    """

    # Only .flv media referenced via the 'current=' query param are matched.
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): 'if mobj is None:' guard and 'return' lost here.
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # NOTE(review): enclosing 'try:' line lost in this copy.
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        # NOTE(review): 'if mobj is None:' guard lost here.
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        # NOTE(review): 'if mobj is None:' guard lost here.
        self._downloader.trouble(u'ERROR: unable to extract title')
        # NOTE(review): .decode() on str fails under Python 3; this code
        # predates the bytes/str split — verify before running.
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        # NOTE(review): the 'return [{' opening of the info dict lost here.
        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'uploader': video_uploader,
        'upload_date': None,
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com.

    NOTE(review): this copy has lost physical lines ('try:' statements,
    'if mobj is None:' guards, the 'return [{' opening).  Code tokens are
    preserved; losses are flagged inline.
    """

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): 'if mobj is None:' guard and 'return' lost here.
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            # NOTE(review): enclosing 'try:' line lost in this copy.
            webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            # NOTE(review): 'if mobj is None:' guard lost here.
            self._downloader.trouble(u'ERROR: Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            # NOTE(review): 'if mobj is None:' guard lost here.
            self._downloader.trouble(u'ERROR: Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            # Recurse exactly once on the canonical /watch/ URL.
            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # NOTE(review): enclosing 'try:' line lost in this copy.
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        # NOTE(review): 'if mobj is None:' guard lost here.
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        # NOTE(review): 'if mobj is None:' guard lost here.
        self._downloader.trouble(u'ERROR: unable to extract video uploader')
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        # NOTE(review): 'if mobj is None:' guard lost here.
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        # NOTE(review): 'if mobj is None:' guard lost here.
        self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        # NOTE(review): 'if mobj is None:' guard lost here.
        self._downloader.trouble(u'ERROR: unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        # NOTE(review): 'if mobj is None:' guard lost here.
        self._downloader.trouble(u'ERROR: unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
        yv_bitrate = '700' # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        # NOTE(review): enclosing 'try:' line lost in this copy.
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        # NOTE(review): 'if mobj is None:' guard lost here.
        self._downloader.trouble(u'ERROR: Unable to extract media URL')
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        # NOTE(review): the 'return [{' opening and the 'url' entry lost here.
        'id': video_id.decode('utf-8'),
        'uploader': video_uploader,
        'upload_date': None,
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
        'thumbnail': video_thumbnail.decode('utf-8'),
        'description': video_description,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com.

    NOTE(review): this copy has lost physical lines ('try:'/'except' around
    the config parse, 'break'/'else:' in the quality loop, the IE_NAME
    attribute, the 'return [{' opening).  Code tokens are preserved.
    """

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
    # NOTE(review): the IE_NAME = u'vimeo' line appears lost in this copy.

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): 'if mobj is None:' guard and 'return' lost here.
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        # NOTE(review): enclosing 'try:' line lost in this copy.
        self.report_download_webpage(video_id)
        webpage_bytes = compat_urllib_request.urlopen(request).read()
        webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        # NOTE(review): enclosing 'try:' line lost in this copy.
        config = webpage.split(' = {config:')[1].split(',assets:')[0]
        config = json.loads(config)
        # NOTE(review): the 'except:' line preceding this handler body lost.
        self._downloader.trouble(u'ERROR: unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            # Reassemble ISO date parts as YYYYMMDD.
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                # NOTE(review): 'else:' line lost before the append below.
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                # NOTE(review): 'break' and the loop's 'else:' clause lost here.
            self._downloader.trouble(u'ERROR: no known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
            %(video_id, sig, timestamp, video_quality, video_codec.upper())

        # NOTE(review): the 'return [{' opening and 'id'/'url' entries lost here.
        'uploader': video_uploader,
        'uploader_id': video_uploader_id,
        'upload_date': video_upload_date,
        'title': video_title,
        'ext': video_extension,
        'thumbnail': video_thumbnail,
        'description': video_description,
2585 class ArteTvIE(InfoExtractor):
2586 """arte.tv information extractor."""
2588 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
2589 _LIVE_URL = r'index-[0-9]+\.html$'
2591 IE_NAME = u'arte.tv'
    def __init__(self, downloader=None):
        """Constructor. Delegates to the base class, which stores the downloader."""
        InfoExtractor.__init__(self, downloader)
2596 def report_download_webpage(self, video_id):
2597 """Report webpage download."""
2598 self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)
2600 def report_extraction(self, video_id):
2601 """Report information extraction."""
2602 self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)
2604 def fetch_webpage(self, url):
2605 self._downloader.incre#!/usr/bin/env python
2606 # -*- coding: utf-8 -*-
2608 from __future__ import absolute_import
2617 import xml.etree.ElementTree
2621 from .utils import *
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader_id:    Nickname or id of the video uploader.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.

    NOTE(review): this copy of the class has lost several physical lines
    (e.g. the 'def working(self):' and '@property / def IE_NAME(self):'
    headers); the surviving tokens are preserved below.
    """

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url) is not None

    # NOTE(review): the 'def working(self):' header line lost in this copy.
        """Getter method for _WORKING."""
        return self._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # NOTE(review): the '_ready' guard lines appear lost in this copy.
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""

    # NOTE(review): the '@property' / 'def IE_NAME(self):' lines lost here.
        # Derive the extractor name from the class name, dropping the 'IE' suffix.
        return type(self).__name__[:-2]
2712 class YoutubeIE(InfoExtractor):
2713 """Information extractor for youtube.com."""
2717 (?:https?://)? # http(s):// (optional)
2718 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
2719 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
2720 (?:.*?\#/)? # handle anchor (#/) redirect urls
2721 (?!view_play_list|my_playlists|artist|playlist) # ignore playlist URLs
2722 (?: # the various things that can precede the ID:
2723 (?:(?:v|embed|e)/) # v/ or embed/ or e/
2724 |(?: # or the v= param in all its forms
2725 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
2726 (?:\?|\#!?) # the params delimiter ? or # or #!
2727 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
2730 )? # optional -> youtube.com/xxxx is OK
2731 )? # all until now is optional -> you can pass the naked ID
2732 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
2733 (?(1).+)? # if we found the ID, everything can follow
2735 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
2736 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
2737 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
2738 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
2739 _NETRC_MACHINE = 'youtube'
2740 # Listed in order of quality
2741 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
2742 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
2743 _video_extensions = {
2749 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
2755 _video_dimensions = {
2771 IE_NAME = u'youtube'
2773 def suitable(self, url):
2774 """Receives a URL and returns True if suitable for this IE."""
2775 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
2777 def report_lang(self):
2778 """Report attempt to set language."""
2779 self._downloader.to_screen(u'[youtube] Setting language')
2781 def report_login(self):
2782 """Report attempt to log in."""
2783 self._downloader.to_screen(u'[youtube] Logging in')
2785 def report_age_confirmation(self):
2786 """Report attempt to confirm age."""
2787 self._downloader.to_screen(u'[youtube] Confirming age')
2789 def report_video_webpage_download(self, video_id):
2790 """Report attempt to download video webpage."""
2791 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
2793 def report_video_info_webpage_download(self, video_id):
2794 """Report attempt to download video info webpage."""
2795 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
2797 def report_video_subtitles_download(self, video_id):
2798 """Report attempt to download video info webpage."""
2799 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
2801 def report_information_extraction(self, video_id):
2802 """Report attempt to extract video information."""
2803 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
2805 def report_unavailable_format(self, video_id, format):
2806 """Report extracted video URL."""
2807 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
2809 def report_rtmp_download(self):
2810 """Indicate the download will use the RTMP protocol."""
2811 self._downloader.to_screen(u'[youtube] RTMP download detected')
2813 def _closed_captions_xml_to_srt(self, xml_string):
2815 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
2816 # TODO parse xml instead of regex
2817 for n, (start, dur_tag, dur, caption) in enumerate(texts):
2818 if not dur: dur = '4'
2819 start = float(start)
2820 end = start + float(dur)
2821 start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
2822 end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
2823 caption = unescapeHTML(caption)
2824 caption = unescapeHTML(caption) # double cycle, intentional
2825 srt += str(n+1) + '\n'
2826 srt += start + ' --> ' + end + '\n'
2827 srt += caption + '\n\n'
    def _extract_subtitles(self, video_id):
        """Download closed captions for video_id and convert them to SRT.

        Returns a (warning_message_or_None, srt_contents_or_None) pair.

        NOTE(review): this copy has lost the 'try:' lines around both
        network calls, the "srt_lang = 'en'" / 'else:' pair, and the
        'if not srt_xml:' guard; surviving tokens are preserved below.
        """
        self.report_video_subtitles_download(video_id)
        # First fetch the list of available caption tracks.
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        # NOTE(review): enclosing 'try:' line lost in this copy.
        srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        # Map language code -> track name.
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        # Choose language: explicit option, else English, else first available.
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            # NOTE(review): "srt_lang = 'en'" and the 'else:' line lost here.
            srt_lang = list(srt_lang_list.keys())[0]
        if not srt_lang in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
        # NOTE(review): enclosing 'try:' line lost in this copy.
        srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        # NOTE(review): 'if not srt_xml:' guard line lost before this return.
        return (u'WARNING: unable to download video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))
    def _print_formats(self, formats):
        """Print itag, file extension and dimensions for each format.

        NOTE(review): the ``for x in formats:`` loop header appears to be
        elided from this listing.
        """
        print('Available formats:')
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
    def _real_initialize(self):
        """Set the YouTube language cookie, then log in and confirm age.

        Best-effort: credential and network problems are reported as
        warnings via to_stderr rather than aborting.
        NOTE(review): several ``try:``/``else:``/``return`` and dict-literal
        lines are elided from this listing.
        """
        if self._downloader is None:
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))

        # Set the interface language before logging in.
        request = compat_urllib_request.Request(self._LANG_URL)
        compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))

        # No authentication to be performed
        if username is None:
        # Login form fields (enclosing dict literal partly elided).
        'current_form': 'loginForm',
        'action_login': 'Log In',
        'username': username,
        'password': password,
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        # A login form still present in the response means the login failed.
        if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
            self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
        # Confirm age to bypass the age gate (form dict partly elided).
        'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        self.report_age_confirmation()
        age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
    def _extract_id(self, url):
        """Extract and return the YouTube video id from *url*.

        NOTE(review): the ``if mobj is None:`` guard and the ``return``
        lines are elided from this listing.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(2)
    def _real_extract(self, url):
        """Extract the information dictionaries for a YouTube URL.

        NOTE(review): numerous ``try:``/``else:``/``return``/``break``
        lines and the dict-literal braces of the result are elided from
        this listing; control flow below is only partially visible.
        """
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Fetch the watch page (forced US/English, age-verified variant).
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Un-escape the backslash-escaped URL found in the page JS.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Fetch video info, trying several 'el' values until one yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
            video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # Uploader name.
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # Uploader id (nickname), scraped from the watch page.
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/user/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # Title.
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # Thumbnail (optional).
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # Upload date: scraped from the page and normalised to YYYYMMDD.
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # Description.
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
            video_description = ''

        # Closed captions, only when requested by the user.
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
            self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
        video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # Session token needed for the download URLs.
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # NOTE(review): 'sig' is read unconditionally although the filter
            # above only requires 'itag' and 'url' — a missing 'sig' would
            # raise KeyError here; verify against the full source.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            # Specific formats. We pick the first in a slash-delimeted sequence.
            # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
            req_formats = req_format.split('/')
            video_url_list = None
            for rf in req_formats:
                video_url_list = [(rf, url_map[rf])]
            if video_url_list is None:
                self._downloader.trouble(u'ERROR: requested format not available')
        self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

        # Build one result dictionary per selected format.
        for format_param, video_real_url in video_url_list:
            # Extension for this format, defaulting to flv.
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                    self._video_dimensions.get(format_param, '???'))

            # Result fields (enclosing dict literal braces elided in listing).
            'url': video_real_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
            'format': video_format,
            'thumbnail': video_thumbnail,
            'description': video_description,
            'player_url': player_url,
            'subtitles': video_subtitles,
            'duration': video_duration
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # NOTE(review): several 'try:'/guard/'return' lines are elided from
    # this listing throughout the class.
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page, then disable the family filter."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        self.report_disclaimer()
        disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))

        # Age-confirmation form (enclosing dict literal partly elided).
        'submit': "Continue - I'm over 18",
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        self.report_age_confirmation()
        disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        """Extract video information from a metacafe.com watch URL."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate YouTube-hosted videos to the YouTube extractor.
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            video_url = mediaURL
            gdaKey = mobj.group(1)
            video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        # Fallback: parse the flashvars blob for mediaData.
        mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        vardict = compat_parse_qs(mobj.group(1))
        if 'mediaData' not in vardict:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = mobj.group(1).replace('\\/', '/')
        video_extension = mediaURL[-3:]
        video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # Result fields (enclosing 'return [{...}]' lines elided in listing).
        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'uploader': video_uploader.decode('utf-8'),
        'upload_date': None,
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # NOTE(review): several 'try:'/guard/'return'/'break' lines are
    # elided from this listing throughout the class.
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract video information from a Dailymotion video URL."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable family filtering so age-restricted pages are served.
        request.add_header('Cookie', 'family_filter=off')
        self.report_download_webpage(video_id)
        webpage_bytes = compat_urllib_request.urlopen(request).read()
        webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Pick the best available quality, highest first.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
        self._downloader.trouble(u'ERROR: unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        self._downloader.trouble(u'ERROR: unable to extract video URL')

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        # lookin for official user
        mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
        if mobj_official is None:
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
        video_uploader = mobj_official.group(1)
        video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if mobj is not None:
            # Page shows DD-MM-YYYY; normalise to YYYYMMDD.
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        # Result fields (enclosing 'return [{...}]' lines elided in listing).
        'uploader': video_uploader,
        'upload_date': video_upload_date,
        'title': video_title,
        'ext': video_extension,
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # NOTE(review): several 'try:'/guard/'return' lines are elided from
    # this listing throughout the class.
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract video information from a photobucket.com URL."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        # Title and uploader come from the same <title> pattern.
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        # Result fields (enclosing 'return [{...}]' lines elided in listing).
        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'uploader': video_uploader,
        'upload_date': None,
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # NOTE(review): several 'try:'/guard/'return' lines are elided from
    # this listing throughout the class.
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video information from a video.yahoo.com URL.

        Non-'/watch/' URLs are first rewritten to the canonical watch
        page and re-extracted once (new_video=False on the recursive call).
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video uploader')
        # NOTE(review): group(1) captures the '(people|profile)' literal;
        # the uploader name appears to be group(2) — looks like a bug, verify.
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
        yv_bitrate = '700' # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        self._downloader.trouble(u'ERROR: Unable to extract media URL')
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        # Result fields (enclosing 'return [{...}]' lines elided in listing).
        'id': video_id.decode('utf-8'),
        'uploader': video_uploader,
        'upload_date': None,
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
        'thumbnail': video_thumbnail.decode('utf-8'),
        'description': video_description,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # NOTE(review): several 'try:'/guard/'return'/'break' lines are
    # elided from this listing throughout the class.
    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video information from a vimeo.com URL."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        self.report_download_webpage(video_id)
        webpage_bytes = compat_urllib_request.urlopen(request).read()
        webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        config = webpage.split(' = {config:')[1].split(',assets:')[0]
        config = json.loads(config)
        self._downloader.trouble(u'ERROR: unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Prefer HD, then SD, then whatever else is available.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
        self._downloader.trouble(u'ERROR: no known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                %(video_id, sig, timestamp, video_quality, video_codec.upper())

        # Result fields (enclosing 'return [{...}]' lines elided in listing).
        'uploader': video_uploader,
        'uploader_id': video_uploader_id,
        'upload_date': video_upload_date,
        'title': video_title,
        'ext': video_extension,
        'thumbnail': video_thumbnail,
        'description': video_description,
3680 class ArteTvIE(InfoExtractor):
3681 """arte.tv information extractor."""
3683 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
3684 _LIVE_URL = r'index-[0-9]+\.html$'
3686 IE_NAME = u'arte.tv'
3688 def __init__(self, downloader=None):
3689 InfoExtractor.__init__(self, downloader)
3691 def report_download_webpage(self, video_id):
3692 """Report webpage download."""
3693 self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)
3695 def report_extraction(self, video_id):
3696 """Report information extraction."""
3697 self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)
    def fetch_webpage(self, url):
        """Download *url* and return the page body.

        NOTE(review): the ``try:`` and ``return`` lines are elided from
        this listing.
        """
        # Count this fetch towards the downloader's download counter.
        self._downloader.increment_downloads()
        request = compat_urllib_request.Request(url)
        self.report_download_webpage(url)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, match *regex* and collect groups into a dict.

        matchTuples is a sequence of (group_index, key, error_message)
        triples; each matched group is stored under *key*.
        NOTE(review): the ``info`` dict initialisation and the trailing
        ``return info`` appear to be elided from this listing.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
            info[key] = mobj.group(i)
3731 def extractLiveStream(self, url):
3732 video_lang = url.split('/')[-4]
3733 info = self.grep_webpage(
3735 r'src="(.*?/videothek_js.*?\.js)',
3738 (1, 'url', u'ERROR: Invalid URL: %s' % url)
3741 http_host = url.split('/')[2]
3742 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
3743 info = self.grep_webpage(
3745 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
3746 '(http://.*?\.swf).*?' +
3750 (1, 'path', u'ERROR: could not extract video path: %s' % url),
3751 (2, 'player', u'ERROR: could not extract video player: %s' % url),
3752 (3, 'url', u'ERROR: could not extract video url: %s' % url)
3755 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
3757 def extractPlus7Stream(self, url):
3758 video_lang = url.split('/')[-3]
3759 info = self.grep_webpage(
3761 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
3764 (1, 'url', u'ERROR: Invalid URL: %s' % url)
3767 next_url = compat_urllib_parse.unquote(info.get('url'))
3768 info = self.grep_webpage(
3770 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
3773 (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
3776 next_url = compat_urllib_parse.unquote(info.get('url'))
3778 info = self.grep_webpage(
3780 r'<video id="(.*?)".*?>.*?' +
3781 '<name>(.*?)</name>.*?' +
3782 '<dateVideo>(.*?)</dateVideo>.*?' +
3783 '<url quality="hd">(.*?)</url>',
3786 (1, 'id', u'ERROR: could not extract video id: %s' % url),
3787 (2, 'title', u'ERROR: could not extract video title: %s' % url),
3788 (3, 'date', u'ERROR: could not extract video date: %s' % url),
3789 (4, 'url', u'ERROR: could not extract video url: %s' % url)
3794 'id': info.get('id'),
3795 'url': compat_urllib_parse.unquote(info.get('url')),
3796 'uploader': u'arte.tv',
3797 'upload_date': info.get('date'),
3798 'title': info.get('title').decode('utf-8'),
3804 def _real_extract(self, url):
3805 video_id = url.split('/')[-1]
3806 self.report_extraction(video_id)
3808 if re.search(self._LIVE_URL, video_id) is not None:
3809 self.extractLiveStream(url)
3812 info = self.extractPlus7Stream(url)
# NOTE(review): this listing is elided — intervening source lines (e.g. `try:`
# headers, `return` statements, guard clauses) are missing from this view;
# comments below describe only what the visible lines establish.
3817 class GenericIE(InfoExtractor):
3818 """Generic last-resort information extractor."""
3821 IE_NAME = u'generic'
3823 def __init__(self, downloader=None):
3824 InfoExtractor.__init__(self, downloader)
3826 def report_download_webpage(self, video_id):
3827 """Report webpage download."""
3828 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
3829 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
3831 def report_extraction(self, video_id):
3832 """Report information extraction."""
3833 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
3835 def report_following_redirect(self, new_url):
3836 """Report information extraction."""
3837 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
3839 def _test_redirect(self, url):
3840 """Check if it is a redirect, like url shorteners, in case restart chain."""
# Request subclass that issues HTTP HEAD instead of GET, so redirects can be
# resolved without downloading response bodies.
3841 class HeadRequest(compat_urllib_request.Request):
3842 def get_method(self):
3845 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
3847 Subclass the HTTPRedirectHandler to make it use our
3848 HeadRequest also on the redirected URL
3850 def redirect_request(self, req, fp, code, msg, headers, newurl):
3851 if code in (301, 302, 303, 307):
3852 newurl = newurl.replace(' ', '%20')
# Drop body-describing headers: a HEAD follow-up carries no payload.
3853 newheaders = dict((k,v) for k,v in req.headers.items()
3854 if k.lower() not in ("content-length", "content-type"))
3855 return HeadRequest(newurl,
3857 origin_req_host=req.get_origin_req_host(),
3860 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
3862 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
3864 Fallback to GET if HEAD is not allowed (405 HTTP error)
3866 def http_error_405(self, req, fp, code, msg, headers):
3870 newheaders = dict((k,v) for k,v in req.headers.items()
3871 if k.lower() not in ("content-length", "content-type"))
# Re-issue the same URL as a plain (GET) request through the parent opener.
3872 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
3874 origin_req_host=req.get_origin_req_host(),
# Build a dedicated opener wired with the HEAD-aware handlers above.
3878 opener = compat_urllib_request.OpenerDirector()
3879 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
3880 HTTPMethodFallback, HEADRedirectHandler,
3881 compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
3882 opener.add_handler(handler())
3884 response = opener.open(HeadRequest(url))
3885 new_url = response.geturl()
# Restart the whole extraction chain on the resolved URL so a more specific
# extractor can claim it.
3890 self.report_following_redirect(new_url)
3891 self._downloader.download([new_url])
3894 def _real_extract(self, url):
3895 if self._test_redirect(url): return
3897 video_id = url.split('/')[-1]
3898 request = compat_urllib_request.Request(url)
3900 self.report_download_webpage(video_id)
3901 webpage = compat_urllib_request.urlopen(request).read()
3902 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3903 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3905 except ValueError as err:
3906 # since this is the last-resort InfoExtractor, if
3907 # this error is thrown, it'll be thrown here
3908 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3911 self.report_extraction(video_id)
3912 # Start with something easy: JW Player in SWFObject
3913 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
3915 # Broaden the search a little bit
3916 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
3918 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3921 # It's possible that one of the regexes
3922 # matched, but returned an empty group:
3923 if mobj.group(1) is None:
3924 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3927 video_url = compat_urllib_parse.unquote(mobj.group(1))
3928 video_id = os.path.basename(video_url)
3930 # here's a fun little line of code for you:
3931 video_extension = os.path.splitext(video_id)[1][1:]
3932 video_id = os.path.splitext(video_id)[0]
3934 # it's tempting to parse this further, but you would
3935 # have to take into account all the variations like
3936 #   Video Title - Site Name
3937 #   Site Name | Video Title
3938 #   Video Title - Tagline | Site Name
3939 # and so on and so forth; it's just not practical
3940 mobj = re.search(r'<title>(.*)</title>', webpage)
3942 self._downloader.trouble(u'ERROR: unable to extract title')
3944 video_title = mobj.group(1)
3946 # video uploader is domain name
3947 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
3949 self._downloader.trouble(u'ERROR: unable to extract title')
3951 video_uploader = mobj.group(1)
3956 'uploader': video_uploader,
3957 'upload_date': None,
3958 'title': video_title,
3959 'ext': video_extension,
# NOTE(review): this listing is elided — intervening source lines (e.g. `try:`
# headers, `return` statements, `if mobj is None:` guards) are missing from this
# view; comments below describe only what the visible lines establish.
3963 class YoutubeSearchIE(InfoExtractor):
3964 """Information Extractor for YouTube search queries."""
# Queries look like "ytsearch:foo", "ytsearchN:foo" or "ytsearchall:foo".
3965 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
# GData JSON-C API; %i is the 1-based start index, page size is fixed at 50.
3966 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
3967 _max_youtube_results = 1000
3968 IE_NAME = u'youtube:search'
3970 def __init__(self, downloader=None):
3971 InfoExtractor.__init__(self, downloader)
3973 def report_download_page(self, query, pagenum):
3974 """Report attempt to download search page with given number."""
3975 query = query.decode(preferredencoding())
3976 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
# Parse the prefix ('', 'all' or a number) and dispatch to _download_n_results.
3978 def _real_extract(self, query):
3979 mobj = re.match(self._VALID_URL, query)
3981 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
3984 prefix, query = query.split(':')
3986 query = query.encode('utf-8')
# Empty prefix means "first result only".
3988 self._download_n_results(query, 1)
3990 elif prefix == 'all':
3991 self._download_n_results(query, self._max_youtube_results)
3997 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
# Requests above the API ceiling are clamped with a warning, not rejected.
3999 elif n > self._max_youtube_results:
4000 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
4001 n = self._max_youtube_results
4002 self._download_n_results(query, n)
4004 except ValueError: # parsing prefix as integer fails
4005 self._download_n_results(query, 1)
4008 def _download_n_results(self, query, n):
4009 """Downloads a specified number of results for a query"""
# Page through the API 50 results at a time until `limit` ids are collected;
# `limit` shrinks to totalItems when the API reports fewer matches than n.
4015 while (50 * pagenum) < limit:
4016 self.report_download_page(query, pagenum+1)
4017 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
4018 request = compat_urllib_request.Request(result_url)
4020 data = compat_urllib_request.urlopen(request).read()
4021 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
4022 self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
4024 api_response = json.loads(data)['data']
4026 new_ids = list(video['id'] for video in api_response['items'])
4027 video_ids += new_ids
4029 limit = min(n, api_response['totalItems'])
# Trim any overshoot from the last page, then hand each id to the downloader.
4032 if len(video_ids) > n:
4033 video_ids = video_ids[:n]
4034 for id in video_ids:
4035 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
# NOTE(review): this listing is elided — intervening source lines (e.g. `try:`
# headers, `return` statements, guard clauses) are missing from this view;
# comments below describe only what the visible lines establish.
4039 class GoogleSearchIE(InfoExtractor):
4040 """Information Extractor for Google Video search queries."""
# Queries look like "gvsearch:foo", "gvsearchN:foo" or "gvsearchall:foo".
4041 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
4042 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
4043 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
# Presence of the "next page" link tells us whether to keep paginating.
4044 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
4045 _max_google_results = 1000
4046 IE_NAME = u'video.google:search'
4048 def __init__(self, downloader=None):
4049 InfoExtractor.__init__(self, downloader)
4051 def report_download_page(self, query, pagenum):
4052 """Report attempt to download playlist page with given number."""
4053 query = query.decode(preferredencoding())
4054 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
# Parse the prefix ('', 'all' or a number) and dispatch to _download_n_results;
# same structure as the other search extractors.
4056 def _real_extract(self, query):
4057 mobj = re.match(self._VALID_URL, query)
4059 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
4062 prefix, query = query.split(':')
4064 query = query.encode('utf-8')
4066 self._download_n_results(query, 1)
4068 elif prefix == 'all':
4069 self._download_n_results(query, self._max_google_results)
4075 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
# Requests above the ceiling are clamped with a warning, not rejected.
4077 elif n > self._max_google_results:
4078 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
4079 n = self._max_google_results
4080 self._download_n_results(query, n)
4082 except ValueError: # parsing prefix as integer fails
4083 self._download_n_results(query, 1)
4086 def _download_n_results(self, query, n):
4087 """Downloads a specified number of results for a query"""
# Scrape the HTML result pages (10 results per page via start=pagenum*10),
# collecting docids until n are found or no "next" link remains.
4093 self.report_download_page(query, pagenum)
4094 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
4095 request = compat_urllib_request.Request(result_url)
4097 page = compat_urllib_request.urlopen(request).read()
4098 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
4099 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
4102 # Extract video identifiers
4103 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
4104 video_id = mobj.group(1)
# De-duplicate ids across pages before counting toward n.
4105 if video_id not in video_ids:
4106 video_ids.append(video_id)
4107 if len(video_ids) == n:
4108 # Specified n videos reached
4109 for id in video_ids:
4110 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
# No further pages: download whatever was collected.
4113 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
4114 for id in video_ids:
4115 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
4118 pagenum = pagenum + 1
# NOTE(review): this listing is elided — intervening source lines (e.g. `try:`
# headers, `return` statements, guard clauses) are missing from this view;
# comments below describe only what the visible lines establish.
4121 class YahooSearchIE(InfoExtractor):
4122 """Information Extractor for Yahoo! Video search queries."""
# Queries look like "yvsearch:foo", "yvsearchN:foo" or "yvsearchall:foo".
4125 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
4126 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
4127 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
4128 _MORE_PAGES_INDICATOR = r'\s*Next'
4129 _max_yahoo_results = 1000
4130 IE_NAME = u'video.yahoo:search'
4132 def __init__(self, downloader=None):
4133 InfoExtractor.__init__(self, downloader)
4135 def report_download_page(self, query, pagenum):
4136 """Report attempt to download playlist page with given number."""
4137 query = query.decode(preferredencoding())
4138 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
# Parse the prefix ('', 'all' or a number) and dispatch to _download_n_results;
# same structure as the other search extractors.
4140 def _real_extract(self, query):
4141 mobj = re.match(self._VALID_URL, query)
4143 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
4146 prefix, query = query.split(':')
4148 query = query.encode('utf-8')
4150 self._download_n_results(query, 1)
4152 elif prefix == 'all':
4153 self._download_n_results(query, self._max_yahoo_results)
4159 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
# Requests above the ceiling are clamped with a warning, not rejected.
4161 elif n > self._max_yahoo_results:
4162 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
4163 n = self._max_yahoo_results
4164 self._download_n_results(query, n)
4166 except ValueError: # parsing prefix as integer fails
4167 self._download_n_results(query, 1)
4170 def _download_n_results(self, query, n):
4171 """Downloads a specified number of results for a query"""
# `already_seen` de-duplicates ids across result pages (ordering is kept by
# appending to video_ids separately).
4174 already_seen = set()
4178 self.report_download_page(query, pagenum)
4179 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
4180 request = compat_urllib_request.Request(result_url)
4182 page = compat_urllib_request.urlopen(request).read()
4183 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
4184 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
4187 # Extract video identifiers
4188 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
4189 video_id = mobj.group(1)
4190 if video_id not in already_seen:
4191 video_ids.append(video_id)
4192 already_seen.add(video_id)
4193 if len(video_ids) == n:
4194 # Specified n videos reached
4195 for id in video_ids:
4196 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
# No further pages: download whatever was collected.
4199 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
4200 for id in video_ids:
4201 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
4204 pagenum = pagenum + 1
# NOTE(review): this listing is elided — intervening source lines (e.g. `try:`
# headers, `return` statements, guard clauses) are missing from this view;
# comments below describe only what the visible lines establish.
4207 class YoutubePlaylistIE(InfoExtractor):
4208 """Information Extractor for YouTube playlists."""
# Group 1: list-type key ('p', 'a' or 'list'); group 2: the playlist id;
# optional group 3: a direct video reference embedded in the playlist URL.
4210 _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
4211 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
4212 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s'
# The "Next »" pagination marker; its absence means the last page was reached.
4213 _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
4214 IE_NAME = u'youtube:playlist'
4216 def __init__(self, downloader=None):
4217 InfoExtractor.__init__(self, downloader)
4219 def report_download_page(self, playlist_id, pagenum):
4220 """Report attempt to download playlist page with given number."""
4221 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
4223 def _real_extract(self, url):
4224 # Extract playlist id
4225 mobj = re.match(self._VALID_URL, url)
4227 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# A single-video reference inside the playlist URL wins over the playlist.
4231 if mobj.group(3) is not None:
4232 self._downloader.download([mobj.group(3)])
4235 # Download playlist pages
4236 # prefix is 'p' as default for playlists but there are other types that need extra care
4237 playlist_prefix = mobj.group(1)
4238 if playlist_prefix == 'a':
4239 playlist_access = 'artist'
4241 playlist_prefix = 'p'
4242 playlist_access = 'view_play_list'
4243 playlist_id = mobj.group(2)
4248 self.report_download_page(playlist_id, pagenum)
4249 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
4250 request = compat_urllib_request.Request(url)
4252 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
4253 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
4254 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
4257 # Extract video identifiers
# De-duplicate within a page before extending the overall list.
4259 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
4260 if mobj.group(1) not in ids_in_page:
4261 ids_in_page.append(mobj.group(1))
4262 video_ids.extend(ids_in_page)
4264 if self._MORE_PAGES_INDICATOR not in page:
4266 pagenum = pagenum + 1
4268 total = len(video_ids)
# Apply the user's --playlist-start/--playlist-end window (1-based options,
# converted to a 0-based slice; end == -1 means "to the end").
4270 playliststart = self._downloader.params.get('playliststart', 1) - 1
4271 playlistend = self._downloader.params.get('playlistend', -1)
4272 if playlistend == -1:
4273 video_ids = video_ids[playliststart:]
4275 video_ids = video_ids[playliststart:playlistend]
4277 if len(video_ids) == total:
4278 self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
4280 self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))
4282 for id in video_ids:
4283 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
# NOTE(review): this listing is elided — intervening source lines (e.g. `try:`
# headers, `return` statements, guard clauses) are missing from this view;
# comments below describe only what the visible lines establish.
4287 class YoutubeChannelIE(InfoExtractor):
4288 """Information Extractor for YouTube channels."""
4290 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
# List view sorted by date-added ascending; %s slots are channel id and page.
4291 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
# The "Next »" pagination marker; its absence means the last page was reached.
4292 _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
4293 IE_NAME = u'youtube:channel'
4295 def report_download_page(self, channel_id, pagenum):
4296 """Report attempt to download channel page with given number."""
4297 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
4299 def _real_extract(self, url):
4300 # Extract channel id
4301 mobj = re.match(self._VALID_URL, url)
4303 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
4306 # Download channel pages
4307 channel_id = mobj.group(1)
4312 self.report_download_page(channel_id, pagenum)
4313 url = self._TEMPLATE_URL % (channel_id, pagenum)
4314 request = compat_urllib_request.Request(url)
4316 page = compat_urllib_request.urlopen(request).read().decode('utf8')
4317 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
4318 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
4321 # Extract video identifiers
# De-duplicate within a page before extending the overall list.
4323 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
4324 if mobj.group(1) not in ids_in_page:
4325 ids_in_page.append(mobj.group(1))
4326 video_ids.extend(ids_in_page)
4328 if self._MORE_PAGES_INDICATOR not in page:
4330 pagenum = pagenum + 1
4332 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
4334 for id in video_ids:
4335 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
# NOTE(review): this listing is elided — intervening source lines (e.g. `try:`
# headers, `return` statements, guard clauses) are missing from this view;
# comments below describe only what the visible lines establish.
4339 class YoutubeUserIE(InfoExtractor):
4340 """Information Extractor for YouTube users."""
# Accepts both full user URLs and the "ytuser:NAME" shorthand.
4342 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
4343 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
# The GData uploads feed caps each response at 50 entries.
4344 _GDATA_PAGE_SIZE = 50
4345 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
4346 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
4347 IE_NAME = u'youtube:user'
4349 def __init__(self, downloader=None):
4350 InfoExtractor.__init__(self, downloader)
4352 def report_download_page(self, username, start_index):
4353 """Report attempt to download user page."""
4354 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
4355 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
4357 def _real_extract(self, url):
4359 mobj = re.match(self._VALID_URL, url)
4361 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
4364 username = mobj.group(1)
4366 # Download video ids using YouTube Data API. Result size per
4367 # query is limited (currently to 50 videos) so we need to query
4368 # page by page until there are no video ids - it means we got
# GData start-index is 1-based, hence the +1.
4375 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
4376 self.report_download_page(username, start_index)
4378 request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
4381 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
4382 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
4383 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
4386 # Extract video identifiers
# De-duplicate within a page before extending the overall list.
4389 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
4390 if mobj.group(1) not in ids_in_page:
4391 ids_in_page.append(mobj.group(1))
4393 video_ids.extend(ids_in_page)
4395 # A little optimization - if current page is not
4396 # "full", ie. does not contain PAGE_SIZE video ids then
4397 # we can assume that this page is the last one - there
4398 # are no more ids on further pages - no need to query
4401 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
4406 all_ids_count = len(video_ids)
# Apply the user's --playlist-start/--playlist-end window (1-based options,
# converted to a 0-based slice; end == -1 means "to the end").
4407 playliststart = self._downloader.params.get('playliststart', 1) - 1
4408 playlistend = self._downloader.params.get('playlistend', -1)
4410 if playlistend == -1:
4411 video_ids = video_ids[playliststart:]
4413 video_ids = video_ids[playliststart:playlistend]
4415 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
4416 (username, all_ids_count, len(video_ids)))
4418 for video_id in video_ids:
4419 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
# NOTE(review): this listing is elided — intervening source lines (e.g. `try:`
# headers, `return` statements, guard clauses, the _PAGE_SIZE definition) are
# missing from this view; comments below describe only the visible lines.
4422 class BlipTVUserIE(InfoExtractor):
4423 """Information Extractor for blip.tv users."""
# Accepts both user URLs and the "bliptvuser:NAME" shorthand.
4425 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
4427 IE_NAME = u'blip.tv:user'
4429 def __init__(self, downloader=None):
4430 InfoExtractor.__init__(self, downloader)
4432 def report_download_page(self, username, pagenum):
4433 """Report attempt to download user page."""
4434 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
4435 (self.IE_NAME, username, pagenum))
4437 def _real_extract(self, url):
4439 mobj = re.match(self._VALID_URL, url)
4441 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
4444 username = mobj.group(1)
# Mobile AJAX endpoint; the numeric users_id is filled in after scraping the
# user page below.
4446 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
4448 request = compat_urllib_request.Request(url)
4451 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
4452 mobj = re.search(r'data-users-id="([^"]+)"', page)
4453 page_base = page_base % mobj.group(1)
4454 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
4455 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
4459 # Download video ids using BlipTV Ajax calls. Result size per
4460 # query is limited (currently to 12 videos) so we need to query
4461 # page by page until there are no video ids - it means we got
4468 self.report_download_page(username, pagenum)
4470 request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )
4473 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
4474 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
4475 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
4478 # Extract video identifiers
# De-duplicate within a page; ids are HTML-unescaped before being stored.
4481 for mobj in re.finditer(r'href="/([^"]+)"', page):
4482 if mobj.group(1) not in ids_in_page:
4483 ids_in_page.append(unescapeHTML(mobj.group(1)))
4485 video_ids.extend(ids_in_page)
4487 # A little optimization - if current page is not
4488 # "full", ie. does not contain PAGE_SIZE video ids then
4489 # we can assume that this page is the last one - there
4490 # are no more ids on further pages - no need to query
4493 if len(ids_in_page) < self._PAGE_SIZE:
4498 all_ids_count = len(video_ids)
# Apply the user's --playlist-start/--playlist-end window (1-based options,
# converted to a 0-based slice; end == -1 means "to the end").
4499 playliststart = self._downloader.params.get('playliststart', 1) - 1
4500 playlistend = self._downloader.params.get('playlistend', -1)
4502 if playlistend == -1:
4503 video_ids = video_ids[playliststart:]
4505 video_ids = video_ids[playliststart:playlistend]
4507 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
4508 (self.IE_NAME, username, all_ids_count, len(video_ids)))
4510 for video_id in video_ids:
4511 self._downloader.download([u'http://blip.tv/'+video_id])
# NOTE(review): this listing is elided — intervening source lines (e.g. `try:`
# headers, `return` statements) are missing from this view; comments below
# describe only what the visible lines establish.
4514 class DepositFilesIE(InfoExtractor):
4515 """Information extractor for depositfiles.com"""
# The (?#locale) comment documents that the optional "../" path segment is a
# two-letter locale prefix.
4517 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
4518 IE_NAME = u'DepositFiles'
4520 def __init__(self, downloader=None):
4521 InfoExtractor.__init__(self, downloader)
4523 def report_download_webpage(self, file_id):
4524 """Report webpage download."""
4525 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
4527 def report_extraction(self, file_id):
4528 """Report information extraction."""
4529 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
4531 def _real_extract(self, url):
4532 file_id = url.split('/')[-1]
4533 # Rebuild url in english locale
4534 url = 'http://depositfiles.com/en/files/' + file_id
4536 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 simulates pressing the "Free download" button.
4537 free_download_indication = { 'gateway_result' : '1' }
4538 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
4540 self.report_download_webpage(file_id)
4541 webpage = compat_urllib_request.urlopen(request).read()
4542 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
4543 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
4546 # Search for the real file URL
4547 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
4548 if (mobj is None) or (mobj.group(1) is None):
4549 # Try to figure out reason of the error.
# The site explains restrictions (quota, parallel downloads, …) in a
# <strong>Attention...</strong> banner; surface that text if present.
4550 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
4551 if (mobj is not None) and (mobj.group(1) is not None):
4552 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
4553 self._downloader.trouble(u'ERROR: %s' % restriction_message)
4555 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
4558 file_url = mobj.group(1)
4559 file_extension = os.path.splitext(file_url)[1][1:]
4561 # Search for file title
4562 mobj = re.search(r'<b title="(.*?)">', webpage)
4564 self._downloader.trouble(u'ERROR: unable to extract title')
4566 file_title = mobj.group(1).decode('utf-8')
4569 'id': file_id.decode('utf-8'),
4570 'url': file_url.decode('utf-8'),
4572 'upload_date': None,
4573 'title': file_title,
4574 'ext': file_extension.decode('utf-8'),
4578 class FacebookIE(InfoExtractor):
4579 """Information Extractor for Facebook"""
4582 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
4583 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
4584 _NETRC_MACHINE = 'facebook'
4585 _available_formats = ['video', 'highqual', 'lowqual']
4586 _video_extensions = {
4591 IE_NAME = u'facebook'
4593 def __init__(self, downloader=None):
4594 InfoExtractor.__init__(self, downloader)
4596 def _reporter(self, message):
4597 """Add header and report message."""
4598 self._downloader.to_screen(u'[facebook] %s' % message)
4600 def report_login(self):
4601 """Report attempt to log in."""
4602 self._reporter(u'Logging in')
4604 def report_video_webpage_download(self, video_id):
4605 """Report attempt to download video webpage."""
4606 self._reporter(u'%s: Downloading video webpage' % video_id)
4608 def report_information_extraction(self, video_id):
4609 """Report attempt to extract video information."""
4610 self._reporter(u'%s: Extracting video information' % video_id)
4612 def _parse_page(self, video_webpage):
4613 """Extract video information from page"""
4615 data = {'title': r'\("video_title", "(.*?)"\)',
4616 'description': r'<div class="datawrap">(.*?)</div>',
4617 'owner': r'\("video_owner_name", "(.*?)"\)',
4618 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
4621 for piece in data.keys():
4622 mobj = re.search(data[piece], video_webpage)
4623 if mobj is not None:
4624 video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
4628 for fmt in self._available_formats:
4629 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
4630 if mobj is not None:
4631 # URL is in a Javascript segment inside an escaped Unicode format within
4632 # the generally utf-8 page
4633 video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
4634 video_info['video_urls'] = video_urls
4638 def _real_initialize(self):
4639 if self._downloader is None:
4644 downloader_params = self._downloader.params
4646 # Attempt to use provided username and password or .netrc data
4647 if downloader_params.get('username', None) is not None:
4648 useremail = downloader_params['username']
4649 password = downloader_params['password']
4650 elif downloader_params.get('usenetrc', False):
4652 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
4653 if info is not None:
4657 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
4658 except (IOError, netrc.NetrcParseError) as err:
4659 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
4662 if useremail is None:
4671 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
4674 login_results = compat_urllib_request.urlopen(request).read()
4675 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
4676 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
4678 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
4679 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
# Extract real URL(s) and metadata for a Facebook video page.
# NOTE(review): this is an elided view of the file — interior lines (try
# headers, returns, else branches) are missing between the numbered lines.
4682 def _real_extract(self, url):
4683 mobj = re.match(self._VALID_URL, url)
# Reject URLs that do not match this IE's pattern.
4685 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
4687 video_id = mobj.group('ID')
# Fetch the video webpage over HTTPS.
4690 self.report_video_webpage_download(video_id)
4691 request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
4693 page = compat_urllib_request.urlopen(request)
4694 video_webpage = page.read()
4695 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
4696 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
4699 # Start extracting information
4700 self.report_information_extraction(video_id)
4702 # Extract information
4703 video_info = self._parse_page(video_webpage)
# Uploader and title are mandatory fields; their absence is an error.
4706 if 'owner' not in video_info:
4707 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
4709 video_uploader = video_info['owner']
4712 if 'title' not in video_info:
4713 self._downloader.trouble(u'ERROR: unable to extract video title')
4715 video_title = video_info['title']
4716 video_title = video_title.decode('utf-8')
# Thumbnail is optional: warn and fall back to an empty string.
4719 if 'thumbnail' not in video_info:
4720 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
4721 video_thumbnail = ''
4723 video_thumbnail = video_info['thumbnail']
# Upload date: parse an RFC-2822-style date and reformat to YYYYMMDD.
4727 if 'upload_date' in video_info:
4728 upload_time = video_info['upload_date']
4729 timetuple = email.utils.parsedate_tz(upload_time)
4730 if timetuple is not None:
4732 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
4737 video_description = video_info.get('description', 'No description available.')
4739 url_map = video_info['video_urls']
4741 # Decide which formats to download
4742 req_format = self._downloader.params.get('format', None)
4743 format_limit = self._downloader.params.get('format_limit', None)
# Honour --format-limit by truncating the preference list at the limit.
4745 if format_limit is not None and format_limit in self._available_formats:
4746 format_list = self._available_formats[self._available_formats.index(format_limit):]
4748 format_list = self._available_formats
4749 existing_formats = [x for x in format_list if x in url_map]
4750 if len(existing_formats) == 0:
4751 self._downloader.trouble(u'ERROR: no known formats available for video')
# Map the requested format (None=best, 'worst', '-1'=all, or a specific one).
4753 if req_format is None:
4754 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
4755 elif req_format == 'worst':
4756 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
4757 elif req_format == '-1':
4758 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
4761 if req_format not in url_map:
4762 self._downloader.trouble(u'ERROR: requested format not available')
4764 video_url_list = [(req_format, url_map[req_format])] # Specific format
# Build one info dict per selected format.
4767 for format_param, video_real_url in video_url_list:
4769 video_extension = self._video_extensions.get(format_param, 'mp4')
4772 'id': video_id.decode('utf-8'),
4773 'url': video_real_url.decode('utf-8'),
4774 'uploader': video_uploader.decode('utf-8'),
4775 'upload_date': upload_date,
4776 'title': video_title,
4777 'ext': video_extension.decode('utf-8'),
4778 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
4779 'thumbnail': video_thumbnail.decode('utf-8'),
4780 'description': video_description.decode('utf-8'),
# NOTE(review): elided view — interior lines are missing between the
# numbered lines (try headers, returns, dict braces).
4784 class BlipTVIE(InfoExtractor):
4785 """Information extractor for blip.tv"""
4787 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
4788 _URL_EXT = r'^.*\.([a-z0-9]+)$'
4789 IE_NAME = u'blip.tv'
4791 def report_extraction(self, file_id):
4792 """Report information extraction."""
4793 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
4795 def report_direct_download(self, title):
4796 """Report information extraction."""
4797 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
4799 def _real_extract(self, url):
4800 mobj = re.match(self._VALID_URL, url)
4802 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Request the JSON API representation of the page (skin=json).
4809 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
4810 request = compat_urllib_request.Request(json_url)
4811 self.report_extraction(mobj.group(1))
4814 urlh = compat_urllib_request.urlopen(request)
# A video/* Content-Type means the URL is the media itself: derive
# title/ext from the URL basename instead of parsing JSON.
4815 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
4816 basename = url.split('/')[-1]
4817 title,ext = os.path.splitext(basename)
4818 title = title.decode('UTF-8')
4819 ext = ext.replace('.', '')
4820 self.report_direct_download(title)
4825 'upload_date': None,
4830 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
4831 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
# Regular case: parse the JSON payload for the video metadata.
4833 if info is None: # Regular URL
4835 json_code_bytes = urlh.read()
4836 json_code = json_code_bytes.decode('utf-8')
4837 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
4838 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
4842 json_data = json.loads(json_code)
4843 if 'Post' in json_data:
4844 data = json_data['Post']
# Reformat blip.tv's '%m-%d-%y %H:%M%p' datestamp to YYYYMMDD.
4848 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
4849 video_url = data['media']['url']
4850 umobj = re.match(self._URL_EXT, video_url)
4852 raise ValueError('Can not determine filename extension')
4853 ext = umobj.group(1)
4856 'id': data['item_id'],
4858 'uploader': data['display_name'],
4859 'upload_date': upload_date,
4860 'title': data['title'],
4862 'format': data['media']['mimeType'],
4863 'thumbnail': data['thumbnailUrl'],
4864 'description': data['description'],
4865 'player_url': data['embedUrl']
4867 except (ValueError,KeyError) as err:
4868 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
# blip.tv serves some media only to an iTunes-like User-Agent.
4871 std_headers['User-Agent'] = 'iTunes/10.6.1'
4875 class MyVideoIE(InfoExtractor):
4876 """Information Extractor for myvideo.de."""
4878 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
4879 IE_NAME = u'myvideo'
4881 def __init__(self, downloader=None):
4882 InfoExtractor.__init__(self, downloader)
4884 def report_download_webpage(self, video_id):
4885 """Report webpage download."""
4886 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
4888 def report_extraction(self, video_id):
4889 """Report information extraction."""
4890 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
4892 def _real_extract(self,url):
4893 mobj = re.match(self._VALID_URL, url)
4895 self._download.trouble(u'ERROR: invalid URL: %s' % url)
4898 video_id = mobj.group(1)
4901 request = compat_urllib_request.Request('http://www.myvideo.de/watch/%s' % video_id)
4903 self.report_download_webpage(video_id)
4904 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
4905 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
4906 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
4909 self.report_extraction(video_id)
4910 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
4913 self._downloader.trouble(u'ERROR: unable to extract media URL')
4915 video_url = mobj.group(1) + ('/%s.flv' % video_id)
4917 mobj = re.search('<title>([^<]+)</title>', webpage)
4919 self._downloader.trouble(u'ERROR: unable to extract title')
4922 video_title = mobj.group(1)
4928 'upload_date': None,
4929 'title': video_title,
# NOTE(review): elided view — interior lines are missing between the
# numbered lines (try headers, returns, else branches, dict literals).
4933 class ComedyCentralIE(InfoExtractor):
4934 """Information extractor for The Daily Show and Colbert Report """
4936 # urls can be abbreviations like :thedailyshow or :colbert
4937 # urls for episodes like:
4938 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
4939 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
4940 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
4941 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
4942 |(https?://)?(www\.)?
4943 (?P<showname>thedailyshow|colbertnation)\.com/
4944 (full-episodes/(?P<episode>.*)|
4946 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
4947 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
4949 IE_NAME = u'comedycentral'
# Bitrates (kbps) in ascending preference order; last entry is best.
4951 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
4953 _video_extensions = {
4961 _video_dimensions = {
# _VALID_URL is a verbose-mode pattern, so suitable() must pass re.VERBOSE.
4970 def suitable(self, url):
4971 """Receives a URL and returns True if suitable for this IE."""
4972 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
4974 def report_extraction(self, episode_id):
4975 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
4977 def report_config_download(self, episode_id):
4978 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
4980 def report_index_download(self, episode_id):
4981 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
4983 def report_player_url(self, episode_id):
4984 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
4987 def _print_formats(self, formats):
4988 print('Available formats:')
4990 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
4993 def _real_extract(self, url):
4994 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
4996 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Expand ':tds' / ':colbert' shortcuts to the full-episodes index URL,
# then re-match so the named groups are populated.
4999 if mobj.group('shortname'):
5000 if mobj.group('shortname') in ('tds', 'thedailyshow'):
5001 url = u'http://www.thedailyshow.com/full-episodes/'
5003 url = u'http://www.colbertnation.com/full-episodes/'
5004 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
5005 assert mobj is not None
5007 if mobj.group('clip'):
5008 if mobj.group('showname') == 'thedailyshow':
5009 epTitle = mobj.group('tdstitle')
5011 epTitle = mobj.group('cntitle')
5014 dlNewest = not mobj.group('episode')
5016 epTitle = mobj.group('showname')
5018 epTitle = mobj.group('episode')
5020 req = compat_urllib_request.Request(url)
5021 self.report_extraction(epTitle)
5023 htmlHandle = compat_urllib_request.urlopen(req)
5024 html = htmlHandle.read()
5025 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
5026 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
# The index may redirect; re-validate the final URL after following it.
5029 url = htmlHandle.geturl()
5030 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
5032 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
5034 if mobj.group('episode') == '':
5035 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
5037 epTitle = mobj.group('episode')
# Locate the mtvnservices media URL embedded in the page.
5039 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', html)
5041 if len(mMovieParams) == 0:
5042 # The Colbert Report embeds the information in a without
5043 # a URL prefix; so extract the alternate reference
5044 # and then add the URL prefix manually.
5046 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', html)
5047 if len(altMovieParams) == 0:
5048 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
5051 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
# Resolve the player URL by following its redirects.
5053 playerUrl_raw = mMovieParams[0][0]
5054 self.report_player_url(epTitle)
5056 urlHandle = compat_urllib_request.urlopen(playerUrl_raw)
5057 playerUrl = urlHandle.geturl()
5058 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
5059 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + compat_str(err))
# Download the MRSS show index for this media URI.
5062 uri = mMovieParams[0][1]
5063 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
5064 self.report_index_download(epTitle)
5066 indexXml = compat_urllib_request.urlopen(indexUrl).read()
5067 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
5068 self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
# One <item> per act/segment; each has its own mediaGen config.
5073 idoc = xml.etree.ElementTree.fromstring(indexXml)
5074 itemEls = idoc.findall('.//item')
5075 for itemEl in itemEls:
5076 mediaId = itemEl.findall('./guid')[0].text
5077 shortMediaId = mediaId.split(':')[-1]
5078 showId = mediaId.split(':')[-2].replace('.com', '')
5079 officialTitle = itemEl.findall('./title')[0].text
5080 officialDate = itemEl.findall('./pubDate')[0].text
5082 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
5083 compat_urllib_parse.urlencode({'uri': mediaId}))
5084 configReq = compat_urllib_request.Request(configUrl)
5085 self.report_config_download(epTitle)
5087 configXml = compat_urllib_request.urlopen(configReq).read()
5088 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
5089 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
# Collect (bitrate, rtmp-url) pairs from the config renditions.
5092 cdoc = xml.etree.ElementTree.fromstring(configXml)
5094 for rendition in cdoc.findall('.//rendition'):
5095 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
5099 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
5102 if self._downloader.params.get('listformats', None):
5103 self._print_formats([i[0] for i in turls])
5106 # For now, just pick the highest bitrate
5107 format,video_url = turls[-1]
5109 # Get the format arg from the arg stream
5110 req_format = self._downloader.params.get('format', None)
5112 # Select format if we can find one
5115 format, video_url = f, v
5118 # Patch to download from alternative CDN, which does not
5119 # break on current RTMPDump builds
5120 broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
5121 better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"
5123 if video_url.startswith(broken_cdn):
5124 video_url = video_url.replace(broken_cdn, better_cdn)
5126 effTitle = showId + u'-' + epTitle
5131 'upload_date': officialDate,
5136 'description': officialTitle,
5137 'player_url': None #playerUrl
5140 results.append(info)
# NOTE(review): elided view — interior lines are missing between the
# numbered lines.
5145 class EscapistIE(InfoExtractor):
5146 """Information extractor for The Escapist """
5148 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
5149 IE_NAME = u'escapist'
5151 def report_extraction(self, showName):
5152 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
5154 def report_config_download(self, showName):
5155 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
5157 def _real_extract(self, url):
5158 mobj = re.match(self._VALID_URL, url)
5160 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
5162 showName = mobj.group('showname')
5163 videoId = mobj.group('episode')
5165 self.report_extraction(showName)
# Decode the page with the charset from Content-Type, defaulting to UTF-8.
5167 webPage = compat_urllib_request.urlopen(url)
5168 webPageBytes = webPage.read()
5169 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
5170 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
5171 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
5172 self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
# Scrape description, thumbnail and player URL from the page's meta tags;
# the config URL is carried in the player URL's "config=" query parameter.
5175 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
5176 description = unescapeHTML(descMatch.group(1))
5177 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
5178 imgUrl = unescapeHTML(imgMatch.group(1))
5179 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
5180 playerUrl = unescapeHTML(playerUrlMatch.group(1))
5181 configUrlMatch = re.search('config=(.*)$', playerUrl)
5182 configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
5184 self.report_config_download(showName)
5186 configJSON = compat_urllib_request.urlopen(configUrl)
5187 m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
5188 configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
5189 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
5190 self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
5193 # Technically, it's JavaScript, not JSON
# Single-quote -> double-quote rewrite so json.loads accepts the JS literal.
5194 configJSON = configJSON.replace("'", '"')
5197 config = json.loads(configJSON)
5198 except (ValueError,) as err:
5199 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
# The media URL is the second playlist entry (index 1).
5202 playlist = config['playlist']
5203 videoUrl = playlist[1]['url']
5208 'uploader': showName,
5209 'upload_date': None,
5212 'thumbnail': imgUrl,
5213 'description': description,
5214 'player_url': playerUrl,
# NOTE(review): elided view — interior lines are missing between the
# numbered lines.
5220 class CollegeHumorIE(InfoExtractor):
5221 """Information extractor for collegehumor.com"""
5224 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
5225 IE_NAME = u'collegehumor'
5227 def report_manifest(self, video_id):
5228 """Report information extraction."""
5229 self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))
5231 def report_extraction(self, video_id):
5232 """Report information extraction."""
5233 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
5235 def _real_extract(self, url):
5236 mobj = re.match(self._VALID_URL, url)
5238 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
5240 video_id = mobj.group('videoid')
5245 'upload_date': None,
# First fetch the moogaloop metadata XML for this video id.
5248 self.report_extraction(video_id)
5249 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
5251 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
5252 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
5253 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
5256 mdoc = xml.etree.ElementTree.fromstring(metaXml)
5258 videoNode = mdoc.findall('./video')[0]
5259 info['description'] = videoNode.findall('./description')[0].text
5260 info['title'] = videoNode.findall('./caption')[0].text
5261 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
5262 manifest_url = videoNode.findall('./file')[0].text
5264 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
# Then fetch the Adobe HDS (f4m) manifest referenced by the metadata.
5267 manifest_url += '?hdcore=2.10.3'
5268 self.report_manifest(video_id)
5270 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
5271 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
5272 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
# The manifest is in the f4m namespace; media/@url and id build the
# final Seg1-Frag1 fragment URL.
5275 adoc = xml.etree.ElementTree.fromstring(manifestXml)
5277 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
5278 node_id = media_node.attrib['url']
5279 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
5280 except IndexError as err:
5281 self._downloader.trouble(u'\nERROR: Invalid manifest file')
5284 url_pr = compat_urllib_parse_urlparse(manifest_url)
5285 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
# NOTE(review): elided view — interior lines are missing between the
# numbered lines.
5292 class XVideosIE(InfoExtractor):
5293 """Information extractor for xvideos.com"""
5295 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
5296 IE_NAME = u'xvideos'
5298 def report_webpage(self, video_id):
5299 """Report information extraction."""
5300 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
5302 def report_extraction(self, video_id):
5303 """Report information extraction."""
5304 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
5306 def _real_extract(self, url):
5307 mobj = re.match(self._VALID_URL, url)
5309 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
5311 video_id = mobj.group(1)
5313 self.report_webpage(video_id)
5315 request = compat_urllib_request.Request(r'http://www.xvideos.com/video' + video_id)
# 'replace' avoids a crash on any non-UTF-8 bytes in the page.
5317 webpage_bytes = compat_urllib_request.urlopen(request).read()
5318 webpage = webpage_bytes.decode('utf-8', 'replace')
5319 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
5320 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
5323 self.report_extraction(video_id)
# The media URL is percent-encoded in the flv_url page parameter.
5327 mobj = re.search(r'flv_url=(.+?)&', webpage)
5329 self._downloader.trouble(u'ERROR: unable to extract video url')
5331 video_url = compat_urllib_parse.unquote(mobj.group(1))
5335 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
5337 self._downloader.trouble(u'ERROR: unable to extract video title')
5339 video_title = mobj.group(1)
5342 # Extract video thumbnail
5343 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
5345 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
# group(0) = the whole matched thumbnail URL, not just the filename group.
5347 video_thumbnail = mobj.group(0)
5353 'upload_date': None,
5354 'title': video_title,
5356 'thumbnail': video_thumbnail,
5357 'description': None,
# NOTE(review): elided view — interior lines are missing between the
# numbered lines.
5363 class SoundcloudIE(InfoExtractor):
5364 """Information extractor for soundcloud.com
5365 To access the media, the uid of the song and a stream token
5366 must be extracted from the page source and the script must make
5367 a request to media.soundcloud.com/crossdomain.xml. Then
5368 the media can be grabbed by requesting from an url composed
5369 of the stream token and uid
5372 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
5373 IE_NAME = u'soundcloud'
5375 def __init__(self, downloader=None):
5376 InfoExtractor.__init__(self, downloader)
5378 def report_resolve(self, video_id):
5379 """Report information extraction."""
5380 self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))
5382 def report_extraction(self, video_id):
5383 """Report information extraction."""
5384 self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))
5386 def _real_extract(self, url):
5387 mobj = re.match(self._VALID_URL, url)
5389 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
5392 # extract uploader (which is in the url)
5393 uploader = mobj.group(1)
5394 # extract simple title (uploader + slug of song title)
5395 slug_title = mobj.group(2)
5396 simple_title = uploader + u'-' + slug_title
# Resolve the track page to its JSON metadata via the API resolver;
# the client_id is hard-coded in the query string.
5398 self.report_resolve('%s/%s' % (uploader, slug_title))
5400 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
5401 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
5402 request = compat_urllib_request.Request(resolv_url)
5404 info_json_bytes = compat_urllib_request.urlopen(request).read()
5405 info_json = info_json_bytes.decode('utf-8')
5406 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
5407 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
# Second request: the streams endpoint yields the actual media URLs.
5410 info = json.loads(info_json)
5411 video_id = info['id']
5412 self.report_extraction('%s/%s' % (uploader, slug_title))
5414 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
5415 request = compat_urllib_request.Request(streams_url)
5417 stream_json_bytes = compat_urllib_request.urlopen(request).read()
5418 stream_json = stream_json_bytes.decode('utf-8')
5419 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
5420 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
# Always pick the 128kbps MP3 HTTP stream.
5423 streams = json.loads(stream_json)
5424 mediaURL = streams['http_mp3_128_url']
5429 'uploader': info['user']['username'],
5430 'upload_date': info['created_at'],
5431 'title': info['title'],
5433 'description': info['description'],
# NOTE(review): elided view — interior lines are missing between the
# numbered lines.
5437 class InfoQIE(InfoExtractor):
5438 """Information extractor for infoq.com"""
5440 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
5443 def report_webpage(self, video_id):
5444 """Report information extraction."""
5445 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
5447 def report_extraction(self, video_id):
5448 """Report information extraction."""
5449 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
5451 def _real_extract(self, url):
5452 mobj = re.match(self._VALID_URL, url)
5454 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
5457 self.report_webpage(url)
5459 request = compat_urllib_request.Request(url)
5461 webpage = compat_urllib_request.urlopen(request).read()
5462 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
5463 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
5466 self.report_extraction(url)
# jsclassref holds a base64-encoded path appended to the RTMP base URL.
# NOTE(review): .decode('base64') is a Python 2-only bytes codec.
5470 mobj = re.search(r"jsclassref='([^']*)'", webpage)
5472 self._downloader.trouble(u'ERROR: unable to extract video url')
5474 video_url = 'rtmpe://video.infoq.com/cfx/st/' + compat_urllib_parse.unquote(mobj.group(1).decode('base64'))
5478 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
5480 self._downloader.trouble(u'ERROR: unable to extract video title')
5482 video_title = mobj.group(1).decode('utf-8')
5484 # Extract description
# Description is optional; a placeholder is used when absent.
5485 video_description = u'No description available.'
5486 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
5487 if mobj is not None:
5488 video_description = mobj.group(1).decode('utf-8')
# The video id and extension come from the last path segment of the URL.
5490 video_filename = video_url.split('/')[-1]
5491 video_id, extension = video_filename.split('.')
5497 'upload_date': None,
5498 'title': video_title,
5499 'ext': extension, # Extension is always(?) mp4, but seems to be flv
5501 'description': video_description,
# NOTE(review): elided view — interior lines are missing between the
# numbered lines.
5506 class MixcloudIE(InfoExtractor):
5507 """Information extractor for www.mixcloud.com"""
# Marked broken pending a port to the new Mixcloud API.
5509 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
5510 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
5511 IE_NAME = u'mixcloud'
5513 def __init__(self, downloader=None):
5514 InfoExtractor.__init__(self, downloader)
5516 def report_download_json(self, file_id):
5517 """Report JSON download."""
5518 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
5520 def report_extraction(self, file_id):
5521 """Report information extraction."""
5522 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
5524 def get_urls(self, jsonData, fmt, bitrate='best'):
5525 """Get urls from 'audio_formats' section in json"""
# 'best' (or an unknown bitrate) falls back to the highest available one.
5528 bitrate_list = jsonData[fmt]
5529 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
5530 bitrate = max(bitrate_list) # select highest
5532 url_list = jsonData[fmt][bitrate]
# Some formats carry a flat URL list with no per-bitrate mapping.
5533 except TypeError: # we have no bitrate info.
5534 url_list = jsonData[fmt]
5537 def check_urls(self, url_list):
5538 """Returns 1st active url from list"""
# Probe each candidate with an HTTP request; network failures mean
# "try the next one".
5539 for url in url_list:
5541 compat_urllib_request.urlopen(url)
5543 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
5548 def _print_formats(self, formats):
5549 print('Available formats:')
5550 for fmt in formats.keys():
5551 for b in formats[fmt]:
5553 ext = formats[fmt][b][0]
5554 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
5555 except TypeError: # we have no bitrate info
5556 ext = formats[fmt][0]
5557 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
5560 def _real_extract(self, url):
5561 mobj = re.match(self._VALID_URL, url)
5563 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
5565 # extract uploader & filename from url
5566 uploader = mobj.group(1).decode('utf-8')
5567 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
5569 # construct API request
5570 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
5571 # retrieve .json file with links to files
5572 request = compat_urllib_request.Request(file_url)
5574 self.report_download_json(file_url)
5575 jsonData = compat_urllib_request.urlopen(request).read()
5576 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
5577 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
# Parse the cloudcast JSON: player SWF plus per-format audio URL lists.
5581 json_data = json.loads(jsonData)
5582 player_url = json_data['player_swf_url']
5583 formats = dict(json_data['audio_formats'])
5585 req_format = self._downloader.params.get('format', None)
5588 if self._downloader.params.get('listformats', None):
5589 self._print_formats(formats)
# 'best'/default: walk formats until one yields a live URL; otherwise
# use the explicitly requested format.
5592 if req_format is None or req_format == 'best':
5593 for format_param in formats.keys():
5594 url_list = self.get_urls(formats, format_param)
5596 file_url = self.check_urls(url_list)
5597 if file_url is not None:
5600 if req_format not in formats:
5601 self._downloader.trouble(u'ERROR: format is not available')
5604 url_list = self.get_urls(formats, req_format)
5605 file_url = self.check_urls(url_list)
5606 format_param = req_format
5609 'id': file_id.decode('utf-8'),
5610 'url': file_url.decode('utf-8'),
5611 'uploader': uploader.decode('utf-8'),
5612 'upload_date': None,
5613 'title': json_data['name'],
5614 'ext': file_url.split('.')[-1].decode('utf-8'),
5615 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
5616 'thumbnail': json_data['thumbnail_url'],
5617 'description': json_data['description'],
5618 'player_url': player_url.decode('utf-8'),
# NOTE(review): elided view — interior lines are missing between the
# numbered lines.
5621 class StanfordOpenClassroomIE(InfoExtractor):
5622 """Information extractor for Stanford's Open ClassRoom"""
5624 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
5625 IE_NAME = u'stanfordoc'
5627 def report_download_webpage(self, objid):
5628 """Report information extraction."""
5629 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
5631 def report_extraction(self, video_id):
5632 """Report information extraction."""
5633 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
# _real_extract handles three URL shapes: a specific video, a course
# page (list of videos), and the site root (list of courses). The two
# list cases recurse via self.extract() on 'reference' entries.
5635 def _real_extract(self, url):
5636 mobj = re.match(self._VALID_URL, url)
5638 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
5641 if mobj.group('course') and mobj.group('video'): # A specific video
5642 course = mobj.group('course')
5643 video = mobj.group('video')
5645 'id': course + '_' + video,
5647 'upload_date': None,
5650 self.report_extraction(info['id'])
5651 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
5652 xmlUrl = baseUrl + video + '.xml'
5654 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
5655 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
5656 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
5658 mdoc = xml.etree.ElementTree.fromstring(metaXml)
5660 info['title'] = mdoc.findall('./title')[0].text
5661 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
5663 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
5665 info['ext'] = info['url'].rpartition('.')[2]
5667 elif mobj.group('course'): # A course page
5668 course = mobj.group('course')
5673 'upload_date': None,
5676 self.report_download_webpage(info['id'])
5678 coursepage = compat_urllib_request.urlopen(url).read()
5679 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
5680 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
# Title falls back to the id when no <h1> is present.
5683 m = re.search('<h1>([^<]+)</h1>', coursepage)
5685 info['title'] = unescapeHTML(m.group(1))
5687 info['title'] = info['id']
5689 m = re.search('<description>([^<]+)</description>', coursepage)
5691 info['description'] = unescapeHTML(m.group(1))
# Collect VideoPage links (order-preserving dedupe) and recurse.
5693 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
5696 'type': 'reference',
5697 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
5701 for entry in info['list']:
5702 assert entry['type'] == 'reference'
5703 results += self.extract(entry['url'])
5708 'id': 'Stanford OpenClassroom',
5711 'upload_date': None,
# Root case: enumerate every CoursePage link from the home page.
5714 self.report_download_webpage(info['id'])
5715 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
5717 rootpage = compat_urllib_request.urlopen(rootURL).read()
5718 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
5719 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
5722 info['title'] = info['id']
5724 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
5727 'type': 'reference',
5728 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
5733 for entry in info['list']:
5734 assert entry['type'] == 'reference'
5735 results += self.extract(entry['url'])
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com.

    Downloads the video page, scrapes the mtv_vt (song name), mtv_an
    (performer) and mtvn_uri meta tags plus the default playlist id, then
    queries the mediaGen service for the rendition list and picks the
    last (highest-quality) rendition.

    NOTE(review): the original region was a line-number-mangled paste with
    lines sampled out; the guard/`return` lines restored below follow the
    uniform trouble-and-return pattern of the sibling extractors — confirm
    against upstream history.
    """

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            # Scheme is optional in _VALID_URL; normalize so urlopen works.
            url = 'http://' + url
        video_id = mobj.group('videoid')
        self.report_webpage(video_id)

        request = compat_urllib_request.Request(url)
        try:
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        # Page is served as latin-1; unescape after decoding.
        song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))

        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # BUGFIX: message used to read "unable to mtvn_uri" (missing verb).
            self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            # type is e.g. "video/mp4" -> ext "mp4"; format string encodes
            # container, resolution and bitrate for --get-format.
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
# --- YoukuIE: information extractor for v.youku.com ---
# NOTE(review): this region is a mangled paste -- each line carries a fused
# original line number and the embedded numbering skips (e.g. 5830, 5832-5833
# absent), so some statements are missing from view.  Comments describe only
# what the visible lines show.
5829 class YoukuIE(InfoExtractor):
5831 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
# Constructor simply delegates to the InfoExtractor base.
5834 def __init__(self, downloader=None):
5835 InfoExtractor.__init__(self, downloader)
# Progress-reporting helpers: status lines go through the downloader.
5837 def report_download_webpage(self, file_id):
5838 """Report webpage download."""
5839 self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)
5841 def report_extraction(self, file_id):
5842 """Report information extraction."""
5843 self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)
# Session id: millisecond timestamp concatenated with two random ints.
# (The enclosing `def` line -- presumably _gen_sid, called at 5922 -- is
# among the missing lines.)
5846 nowTime = int(time.time() * 1000)
5847 random1 = random.randint(1000,1998)
5848 random2 = random.randint(1000,9999)
5850 return "%d%d%d" %(nowTime,random1,random2)
# Builds a deterministic shuffle of the character table below, driven by a
# 16-bit linear-congruential sequence seeded with `seed` -- used to decode
# Youku's obfuscated file ids.  (`mixed = []` init line missing from view.)
5852 def _get_file_ID_mix_string(self, seed):
5854 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
5856 for i in range(len(source)):
5857 seed = (seed * 211 + 30031 ) % 65536
5858 index = math.floor(seed / 65536 * len(source) )
# Pick and then remove the selected character so each is used exactly once.
5859 mixed.append(source[int(index)])
5860 source.remove(source[int(index)])
5861 #return ''.join(mixed)
# Decode the '*'-separated fileId: each numeric token indexes into the
# shuffled table.  (`realId = []` init and the loop over tokens are partly
# missing from view.)
5864 def _get_file_id(self, fileId, seed):
5865 mixed = self._get_file_ID_mix_string(seed)
5866 ids = fileId.split('*')
5870 realId.append(mixed[int(ch)])
5871 return ''.join(realId)
# Main entry point: fetch the getPlayList JSON for the video id, pick a
# format, then build one info dict per video segment.
5873 def _real_extract(self, url):
5874 mobj = re.match(self._VALID_URL, url)
5876 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
5878 video_id = mobj.group('ID')
5880 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
5882 request = compat_urllib_request.Request(info_url, None, std_headers)
5884 self.report_download_webpage(video_id)
5885 jsondata = compat_urllib_request.urlopen(request).read()
5886 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
5887 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
5890 self.report_extraction(video_id)
5892 jsonstr = jsondata.decode('utf-8')
5893 config = json.loads(jsonstr)
5895 video_title = config['data'][0]['title']
5896 seed = config['data'][0]['seed']
# Format selection: honour --format; 'best'/'worst' are mapped onto the
# formats advertised in streamfileids (branch bodies largely missing from
# view, lines 5903-5914).
5898 format = self._downloader.params.get('format', None)
5899 supported_format = list(config['data'][0]['streamfileids'].keys())
5901 if format is None or format == 'best':
5902 if 'hd2' in supported_format:
5907 elif format == 'worst':
5915 fileid = config['data'][0]['streamfileids'][format]
5916 keys = [s['k'] for s in config['data'][0]['segs'][format]]
5917 except (UnicodeDecodeError, ValueError, KeyError):
5918 self._downloader.trouble(u'ERROR: unable to extract info section')
5922 sid = self._gen_sid()
5923 fileid = self._get_file_id(fileid, seed)
5925 #column 8,9 of fileid represent the segment number
5926 #fileid[7:9] should be changed
# One download URL (and one info dict) per segment: the zero-based segment
# index is spliced into the fileid as two uppercase hex digits.
5927 for index, key in enumerate(keys):
5929 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
5930 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
5933 'id': '%s_part%02d' % (video_id, index),
5934 'url': download_url,
5936 'upload_date': None,
5937 'title': video_title,
5940 files_info.append(info)
# --- XNXXIE: information extractor for video.xnxx.com ---
# NOTE(review): mangled paste with fused line numbers and gaps; the
# `if ... is None:` guards that precede each trouble() call here are among
# the missing lines.
5945 class XNXXIE(InfoExtractor):
5946 """Information extractor for xnxx.com"""
5948 _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
# Regexes for the flv url, the page <title> and the thumbnail url that are
# embedded in the video page.
5950 VIDEO_URL_RE = r'flv_url=(.*?)&'
5951 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
5952 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
5954 def report_webpage(self, video_id):
5955 """Report webpage download."""
5956 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
5958 def report_extraction(self, video_id):
5959 """Report information extraction"""
5960 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
# Entry point: fetch the page and scrape url/title/thumbnail with the
# regexes above.
5962 def _real_extract(self, url):
5963 mobj = re.match(self._VALID_URL, url)
5965 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
5967 video_id = mobj.group(1)
5969 self.report_webpage(video_id)
5971 # Get webpage content
5973 webpage_bytes = compat_urllib_request.urlopen(url).read()
5974 webpage = webpage_bytes.decode('utf-8')
5975 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
# NOTE(review): sibling extractors interpolate compat_str(err) here; this
# one uses bare `err` -- consider aligning for consistent py2/py3 output.
5976 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
5979 result = re.search(self.VIDEO_URL_RE, webpage)
5981 self._downloader.trouble(u'ERROR: unable to extract video url')
# flv_url is percent-encoded inside the page; decode it.
5983 video_url = compat_urllib_parse.unquote(result.group(1))
5985 result = re.search(self.VIDEO_TITLE_RE, webpage)
5987 self._downloader.trouble(u'ERROR: unable to extract video title')
5989 video_title = result.group(1)
5991 result = re.search(self.VIDEO_THUMB_RE, webpage)
5993 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
5995 video_thumbnail = result.group(1)
# Returned info dict (surrounding braces/keys partly missing from view).
6001 'upload_date': None,
6002 'title': video_title,
6004 'thumbnail': video_thumbnail,
6005 'description': None,
# --- GooglePlusIE: information extractor for plus.google.com posts ---
# NOTE(review): mangled paste with fused line numbers and gaps; `try:` lines
# and the `if mobj is None:` guards are among the missing lines.
6009 class GooglePlusIE(InfoExtractor):
6010 """Information extractor for plus.google.com."""
6012 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
6013 IE_NAME = u'plus.google'
# Constructor simply delegates to the InfoExtractor base.
6015 def __init__(self, downloader=None):
6016 InfoExtractor.__init__(self, downloader)
6018 def report_extract_entry(self, url):
6019 """Report that the post entry is being downloaded."""
6020 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
6022 def report_date(self, upload_date):
6023 """Report the entry's upload date."""
6024 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
6026 def report_uploader(self, uploader):
6027 """Report the entry's uploader."""
6028 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
6030 def report_title(self, video_title):
6031 """Report the extracted title."""
6032 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
6034 def report_extract_vid_page(self, video_page):
6035 """Report information extraction from the video page."""
6036 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
6038 def _real_extract(self, url):
6039 # Extract id from URL
6040 mobj = re.match(self._VALID_URL, url)
6042 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
6045 post_url = mobj.group(0)
6046 video_id = mobj.group(1)
6048 video_extension = 'flv'
6050 # Step 1, Retrieve post webpage to extract further information
6051 self.report_extract_entry(post_url)
6052 request = compat_urllib_request.Request(post_url)
6054 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
6055 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
6056 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
6059 # Extract update date
6061 pattern = 'title="Timestamp">(.*?)</a>'
6062 mobj = re.search(pattern, webpage)
6064 upload_date = mobj.group(1)
6065 # Convert timestring to a format suitable for filename
6066 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
6067 upload_date = upload_date.strftime('%Y%m%d')
6068 self.report_date(upload_date)
# Extract the uploader name from the rel="author" anchor.
6072 pattern = r'rel\="author".*?>(.*?)</a>'
6073 mobj = re.search(pattern, webpage)
6075 uploader = mobj.group(1)
6076 self.report_uploader(uploader)
6079 # Get the first line for title
6081 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
6082 mobj = re.search(pattern, webpage)
6084 video_title = mobj.group(1)
6085 self.report_title(video_title)
6087 # Step 2, Stimulate clicking the image box to launch video
6088 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
6089 mobj = re.search(pattern, webpage)
6091 self._downloader.trouble(u'ERROR: unable to extract video page URL')
6093 video_page = mobj.group(1)
6094 request = compat_urllib_request.Request(video_page)
6096 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
6097 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
6098 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
6100 self.report_extract_vid_page(video_page)
6103 # Extract video links on video page
6104 """Extract video links of all sizes"""
6105 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
6106 mobj = re.findall(pattern, webpage)
6108 self._downloader.trouble(u'ERROR: unable to extract video links')
6110 # Sort in resolution
6111 links = sorted(mobj)
6113 # Choose the lowest of the sort, i.e. highest resolution
6114 video_url = links[-1]
6115 # Only get the url. The resolution part in the tuple has no use anymore
6116 video_url = video_url[-1]
6117 # Treat escaped \u0026 style hex
6119 video_url = video_url.decode("unicode_escape")
# On Python 3 str has no .decode; round-trip through bytes instead.
6120 except AttributeError: # Python 3
6121 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
# Returned info dict (surrounding braces/keys partly missing from view).
6127 'uploader': uploader,
6128 'upload_date': upload_date,
6129 'title': video_title,
6130 'ext': video_extension,
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages.

    The downloadable MP4 URL is derived directly from the page path on
    Turner's CDN; title, date and description are scraped from the page
    HTML.

    NOTE(review): the original region was a line-number-mangled paste with
    lines sampled out; guard/`return` lines restored below follow the
    uniform trouble-and-return pattern of the sibling extractors.
    """

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            # Normalize directory-style URLs to the bare video path.
            video_id = video_id[:-len('/index.html')]

        self.report_extraction(video_id)
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        # The CDN URL is a pure function of the page path.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            # Scrape a single regex group from the page, HTML-unescaped;
            # fall back to `default` when the pattern is absent.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # BUGFIX: key was misspelled 'uploader_date'; the downloader
            # contract (see the InfoExtractor docstring) reads 'upload_date'.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
# --- JustinTVIE: information extractor for justin.tv / twitch.tv ---
# NOTE(review): mangled paste with fused line numbers and gaps; `try:`
# lines, guards and the paging-state initialisation (e.g. `info = []`,
# `offset`, `paged`) are among the missing lines.
6179 class JustinTVIE(InfoExtractor):
6180 """Information extractor for justin.tv and twitch.tv"""
6181 # TODO: One broadcast may be split into multiple videos. The key
6182 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
6183 # starts at 1 and increases. Can we treat all parts as one video?
6185 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
6186 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
# The API returns at most this many clips per request; used for paging.
6187 _JUSTIN_PAGE_LIMIT = 100
6188 IE_NAME = u'justin.tv'
6190 def report_extraction(self, file_id):
6191 """Report information extraction."""
6192 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
6194 def report_download_page(self, channel, offset):
6195 """Report attempt to download a single page of videos."""
6196 self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
6197 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
6199 # Return count of items, list of *valid* items
6200 def _parse_page(self, url):
6202 urlh = compat_urllib_request.urlopen(url)
6203 webpage_bytes = urlh.read()
6204 webpage = webpage_bytes.decode('utf-8', 'ignore')
6205 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
6206 self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
# Build one info dict per clip from the API's JSON list.
6209 response = json.loads(webpage)
6211 for clip in response:
6212 video_url = clip['video_file_url']
6214 video_extension = os.path.splitext(video_url)[1][1:]
# created_on starts with YYYY-MM-DD; strip the dashes to get YYYYMMDD.
6215 video_date = re.sub('-', '', clip['created_on'][:10])
6219 'title': clip['title'],
6220 'uploader': clip.get('user_id', clip.get('channel_id')),
6221 'upload_date': video_date,
6222 'ext': video_extension,
6224 return (len(response), info)
6226 def _real_extract(self, url):
6227 mobj = re.match(self._VALID_URL, url)
6229 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Group 1 is a channel name, group 2 (if present) a broadcast id; pick
# whichever matched last and choose the API endpoint accordingly.
6232 api = 'http://api.justin.tv'
6233 video_id = mobj.group(mobj.lastindex)
6235 if mobj.lastindex == 1:
6237 api += '/channel/archives/%s.json'
6239 api += '/clip/show/%s.json'
6240 api = api % (video_id,)
6242 self.report_extraction(video_id)
# Page through the API in _JUSTIN_PAGE_LIMIT-sized chunks until a short
# page signals the end (loop header and state init missing from view).
6246 limit = self._JUSTIN_PAGE_LIMIT
6249 self.report_download_page(video_id, offset)
6250 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
6251 page_count, page_info = self._parse_page(page_url)
6252 info.extend(page_info)
6253 if not paged or page_count != limit:
# --- FunnyOrDieIE: information extractor for funnyordie.com ---
# NOTE(review): mangled paste with fused line numbers and gaps; `try:`
# lines, the `if m is None:` guards and the final info-dict/return are
# among the missing lines.
6258 class FunnyOrDieIE(InfoExtractor):
6259 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
6260 IE_NAME = u'FunnyOrDie'
6262 def report_extraction(self, video_id):
# Report information extraction progress via the downloader.
6263 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
6265 def _real_extract(self, url):
6266 mobj = re.match(self._VALID_URL, url)
6268 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
6271 video_id = mobj.group('id')
6272 self.report_extraction(video_id)
6274 urlh = compat_urllib_request.urlopen(url)
6275 webpage_bytes = urlh.read()
6276 webpage = webpage_bytes.decode('utf-8', 'ignore')
6277 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
6278 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
# The direct media URL is the second <source> inside the <video> tag.
6281 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
6283 self._downloader.trouble(u'ERROR: unable to find video information')
6284 video_url = unescapeHTML(m.group('url'))
6286 m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
6288 self._downloader.trouble(u'Cannot find video title')
6289 title = unescapeHTML(m.group('title'))
# Description comes from the og:description meta tag.
6291 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
6293 desc = unescapeHTML(m.group('desc'))
# Returned info dict (surrounding braces/keys missing from view).
6302 'description': desc,
6306 class TweetReelIE(InfoExtractor):
6307 _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'
6309 def report_extraction(self, video_id):
6310 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
6312 def _real_extract(self, url):
6313 mobj = re.match(self._VALID_URL, url)
6315 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
6318 video_id = mobj.group('id')
6319 self.report_extraction(video_id)
6321 urlh = compat_urllib_request.urlopen(url)
6322 webpage_bytes = urlh.read()
6323 webpage = webpage_bytes.decode('utf-8', 'ignore')
6324 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
6325 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
6328 m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
6330 self._downloader.trouble(u'ERROR: Cannot find status ID')
6331 status_id = m.group(1)
6333 m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
6335 self._downloader.trouble(u'WARNING: Cannot find description')
6336 desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()
6338 m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
6340 self._downloader.trouble(u'ERROR: Cannot find uploader')
6341 uploader = unescapeHTML(m.group('uploader'))
6342 uploader_id = unescapeHTML(m.group('uploader_id'))
6344 m = re.search(r'<span unixtime="([0-9]+)"', webpage)
6346 self._downloader.trouble(u'ERROR: Cannot find upload date')
6347 upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')
6350 video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'
6357 'description': desc,
6358 'uploader': uploader,
6359 'uploader_id': uploader_id,
6360 'internal_id': status_id,
6361 'upload_date': upload_date