2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
13 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    uploader:       Full name of the video uploader, unescaped.
    upload_date:    Video upload date (YYYYMMDD).
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader_id:    Nickname or id of the video uploader.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """
def __init__(self, downloader=None):
    """Constructor. Receives an optional downloader (FileDownloader)."""
    # Register the downloader that this IE reports progress and errors to.
    self.set_downloader(downloader)
def suitable(self, url):
    """Return True when *url* matches this extractor's _VALID_URL pattern."""
    match = re.match(self._VALID_URL, url)
    return match is not None
78 """Getter method for _WORKING."""
82 """Initializes an instance (authentication, etc)."""
84 self._real_initialize()
def extract(self, url):
    """Extracts URL information and returns it in list of dicts.

    Delegates the actual work to the subclass hook _real_extract().
    """
    return self._real_extract(url)
def set_downloader(self, downloader):
    """Sets the downloader for this IE.

    The downloader is the FileDownloader used for screen output,
    error reporting and option lookup (self._downloader.params).
    """
    self._downloader = downloader
def _real_initialize(self):
    """Real initialization process. Redefine in subclasses.

    Subclasses needing setup (login, age confirmation, ...) override
    this; the base implementation is a no-op.
    """
def _real_extract(self, url):
    """Real extraction process. Redefine in subclasses.

    Must return a list of information dictionaries for *url* (see the
    class docstring for the required and optional fields).
    """
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

        (?:https?://)? # http(s):// (optional)
        (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
           tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
        (?:.*?\#/)? # handle anchor (#/) redirect urls
        (?!view_play_list|my_playlists|artist|playlist) # ignore playlist URLs
        (?: # the various things that can precede the ID:
            (?:(?:v|embed|e)/) # v/ or embed/ or e/
            |(?: # or the v= param in all its forms
                (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                (?:\?|\#!?) # the params delimiter ? or # or #!
                (?:.+&)? # any other preceding param (like /?s=tuff&v=xxxx)
            )? # optional -> youtube.com/xxxx is OK
        )? # all until now is optional -> you can pass the naked ID
        ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
        (?(1).+)? # if we found the ID, everything can follow
    # URL that forces the YouTube UI into English (so scraping regexes match)
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Pulls the original destination out of an age-gate/redirect URL
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality (itag codes, best first)
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # itag -> display resolution string
    _video_dimensions = {
def suitable(self, url):
    """Return True when *url* looks like a YouTube video URL."""
    matched = re.match(self._VALID_URL, url, re.VERBOSE)
    return matched is not None
def report_lang(self):
    """Report attempt to set language."""
    self._downloader.to_screen(u'[youtube] Setting language')

def report_login(self):
    """Report attempt to log in."""
    self._downloader.to_screen(u'[youtube] Logging in')

def report_age_confirmation(self):
    """Report attempt to confirm age."""
    self._downloader.to_screen(u'[youtube] Confirming age')

def report_video_webpage_download(self, video_id):
    """Report attempt to download video webpage."""
    self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

def report_video_info_webpage_download(self, video_id):
    """Report attempt to download video info webpage."""
    self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

def report_video_subtitles_download(self, video_id):
    """Report attempt to download video subtitles."""
    self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

def report_information_extraction(self, video_id):
    """Report attempt to extract video information."""
    self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

def report_unavailable_format(self, video_id, format):
    """Report that the requested format is not available."""
    self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

def report_rtmp_download(self):
    """Indicate the download will use the RTMP protocol."""
    self._downloader.to_screen(u'[youtube] RTMP download detected')
def _closed_captions_xml_to_srt(self, xml_string):
    """Convert YouTube's closed-captions XML into .srt file contents.

    NOTE(review): the accumulator initialisation, the float() conversion
    of `start` and the final return are not visible in this chunk —
    confirm against the full source before relying on this listing.
    """
    # Scrape <text start="..." dur="...">caption</text> entries with a regex.
    texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
    # TODO parse xml instead of regex
    for n, (start, dur_tag, dur, caption) in enumerate(texts):
        if not dur: dur = '4'  # default caption duration (seconds) when dur attribute is absent
        end = start + float(dur)  # NOTE(review): assumes `start` was converted to float earlier — confirm
        # Format timestamps as HH:MM:SS,mmm (the .srt convention)
        start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
        end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
        caption = unescapeHTML(caption)
        caption = unescapeHTML(caption) # double cycle, intentional
        srt += str(n+1) + '\n'
        srt += start + ' --> ' + end + '\n'
        srt += caption + '\n\n'
def _extract_subtitles(self, video_id):
    """Download the caption track list and one caption track for *video_id*.

    Returns a (warning_message, srt_contents) pair; one of the two is
    always None. NOTE(review): several `try:` headers and an `elif`
    branch are not visible in this chunk.
    """
    self.report_video_subtitles_download(video_id)
    request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
    # Build a language-code -> track-name map of the available captions.
    srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
    srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
    if not srt_lang_list:
        return (u'WARNING: video has no closed captions', None)
    # Language preference: user-requested, then English, then first available.
    if self._downloader.params.get('subtitleslang', False):
        srt_lang = self._downloader.params.get('subtitleslang')
    elif 'en' in srt_lang_list:
        srt_lang = list(srt_lang_list.keys())[0]
    if not srt_lang in srt_lang_list:
        return (u'WARNING: no closed captions found in the specified language', None)
    request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
        srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        return (u'WARNING: unable to download video subtitles', None)
    return (None, self._closed_captions_xml_to_srt(srt_xml))
def _print_formats(self, formats):
    """Print the available itag codes with their extension and resolution."""
    print('Available formats:')
    # NOTE(review): the `for x in formats:` header is not visible in this chunk.
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
def _real_initialize(self):
    """Set the YouTube UI language to English, optionally log in, and confirm age.

    NOTE(review): multiple `try:` headers, branch bodies and dict
    openings are not visible in this chunk.
    """
    if self._downloader is None:

    downloader_params = self._downloader.params

    # Attempt to use provided username and password or .netrc data
    if downloader_params.get('username', None) is not None:
        username = downloader_params['username']
        password = downloader_params['password']
    elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
        except (IOError, netrc.NetrcParseError) as err:
            self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))

    # Force English so the later scraping regexes match.
    request = compat_urllib_request.Request(self._LANG_URL)
        compat_urllib_request.urlopen(request).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))

    # No authentication to be performed
        'current_form': 'loginForm',
        'action_login': 'Log In',
        'username': username,
        'password': password,
    request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        # A login form in the response means the credentials were rejected.
        if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
            self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))

        'action_confirm': 'Confirm',
    request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        self.report_age_confirmation()
        age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
def _real_extract(self, url):
    """Extract the information dictionaries for a YouTube URL.

    NOTE(review): many `try:` headers, `if mobj is (not) None:` guards,
    `return` statements and the result-dict opening are not visible in
    this chunk.
    """
    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    mobj = re.search(self._NEXT_URL_RE, url)
        url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')

    # Extract video id from URL
    mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
    video_id = mobj.group(2)

    self.report_video_webpage_download(video_id)
    request = compat_urllib_request.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
        video_webpage_bytes = compat_urllib_request.urlopen(request).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

    video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        # Undo the JSON backslash-escaping in the matched URL.
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

    # Try several `el` parameter values until one yields a token.
    self.report_video_info_webpage_download(video_id)
    for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
        video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                % (video_id, el_type))
        request = compat_urllib_request.Request(video_info_url)
            video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
            video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
    if 'token' not in video_info:
        if 'reason' in video_info:
            self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        self._downloader.trouble(u'ERROR: "rental" videos not supported')

    # Start extracting information
    self.report_information_extraction(video_id)

    if 'author' not in video_info:
        self._downloader.trouble(u'ERROR: unable to extract uploader name')
    video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

    video_uploader_id = None
    mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/user/([^"]+)">', video_webpage)
        video_uploader_id = mobj.group(1)
        self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

    if 'title' not in video_info:
        self._downloader.trouble(u'ERROR: unable to extract video title')
    video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

    if 'thumbnail_url' not in video_info:
        self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
    else:   # don't panic if we can't find it
        video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

    # Upload date: normalise separators, then try several date formats.
    mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
        for expression in format_expressions:
                upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

    video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:
        video_description = clean_html(video_description)
        video_description = ''

    video_subtitles = None
    if self._downloader.params.get('writesubtitles', False):
        (srt_error, video_subtitles) = self._extract_subtitles(video_id)
            self._downloader.trouble(srt_error)

    if 'length_seconds' not in video_info:
        self._downloader.trouble(u'WARNING: unable to extract video duration')
        video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

    video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

    # Decide which formats to download
    req_format = self._downloader.params.get('format', None)

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        video_url_list = [(None, video_info['conn'][0])]
    elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
        # Parse the comma-separated stream map into itag -> signed URL.
        url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
        url_data = [compat_parse_qs(uds) for uds in url_data_strs]
        url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
        url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

        format_limit = self._downloader.params.get('format_limit', None)
        available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
        if format_limit is not None and format_limit in available_formats:
            format_list = available_formats[available_formats.index(format_limit):]
            format_list = available_formats
        existing_formats = [x for x in format_list if x in url_map]
        if len(existing_formats) == 0:
            self._downloader.trouble(u'ERROR: no known formats available for video')
        if self._downloader.params.get('listformats', None):
            self._print_formats(existing_formats)
        if req_format is None or req_format == 'best':
            video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
        elif req_format == 'worst':
            video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
        elif req_format in ('-1', 'all'):
            video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            # Specific formats. We pick the first in a slash-delimited sequence.
            # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
            req_formats = req_format.split('/')
            video_url_list = None
            for rf in req_formats:
                    video_url_list = [(rf, url_map[rf])]
            if video_url_list is None:
                self._downloader.trouble(u'ERROR: requested format not available')
        self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

    # Build one information dictionary per selected format.
    for format_param, video_real_url in video_url_list:
        video_extension = self._video_extensions.get(format_param, 'flv')

        video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                self._video_dimensions.get(format_param, '???'))

            'url': video_real_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
            'format': video_format,
            'thumbnail': video_thumbnail,
            'description': video_description,
            'player_url': player_url,
            'subtitles': video_subtitles,
            'duration': video_duration
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    # Family-filter pages: the disclaimer we fetch, then the form we POST.
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
def _real_initialize(self):
    """Fetch the disclaimer page and POST the family-filter opt-out form.

    NOTE(review): the `try:` headers and the disclaimer_form dict opening
    are not visible in this chunk.
    """
    # Retrieve disclaimer
    request = compat_urllib_request.Request(self._DISCLAIMER)
        self.report_disclaimer()
        disclaimer = compat_urllib_request.urlopen(request).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))

        'submit': "Continue - I'm over 18",
    request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        self.report_age_confirmation()
        disclaimer = compat_urllib_request.urlopen(request).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
def _real_extract(self, url):
    """Extract the information dictionary for a Metacafe URL.

    NOTE(review): several guards, `try:` headers, `return` statements and
    the result-dict opening are not visible in this chunk.
    """
    # Extract id and simplified title from URL
    mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

    video_id = mobj.group(1)

    # Check if video comes from YouTube
    mobj2 = re.match(r'^yt-(.*)$', video_id)
    if mobj2 is not None:
        # Re-dispatch yt-prefixed IDs to the YouTube extractor.
        self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])

    # Retrieve video webpage to extract further information
    request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))

    # Extract URL, uploader and title from webpage
    self.report_extraction(video_id)
    mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))
        video_extension = mediaURL[-3:]

        # Extract gdaKey if available
        mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            gdaKey = mobj.group(1)
            video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        # Fallback: pull the media URL out of the flashvars blob.
        mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        vardict = compat_parse_qs(mobj.group(1))
        if 'mediaData' not in vardict:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = mobj.group(1).replace('\\/', '/')
        video_extension = mediaURL[-3:]
        video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

    mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
    video_title = mobj.group(1).decode('utf-8')

    mobj = re.search(r'submitter=(.*?);', webpage)
        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
    video_uploader = mobj.group(1)

        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'uploader': video_uploader.decode('utf-8'),
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
def _real_extract(self, url):
    """Extract the information dictionary for a Dailymotion URL.

    NOTE(review): several guards, `try:` headers and the result-dict
    opening are not visible in this chunk.
    """
    # Extract id and simplified title from URL
    mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

    # Strip slug/query decoration off the raw ID.
    video_id = mobj.group(1).split('_')[0].split('?')[0]

    video_extension = 'mp4'

    # Retrieve video webpage to extract further information
    request = compat_urllib_request.Request(url)
    # Disable the family filter so restricted videos are served.
    request.add_header('Cookie', 'family_filter=off')
        self.report_download_webpage(video_id)
        webpage_bytes = compat_urllib_request.urlopen(request).read()
        webpage = webpage_bytes.decode('utf-8')
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))

    # Extract URL, uploader and title from webpage
    self.report_extraction(video_id)
    mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
    flashvars = compat_urllib_parse.unquote(mobj.group(1))

    # Probe qualities from best to worst.
    for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            self._downloader.to_screen(u'[dailymotion] Using %s' % key)
        self._downloader.trouble(u'ERROR: unable to extract video URL')

    mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        self._downloader.trouble(u'ERROR: unable to extract video URL')

    video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

    # TODO: support choosing qualities

    mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
    video_title = unescapeHTML(mobj.group('title'))

    video_uploader = None
    mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        # looking for official user
        mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
        if mobj_official is None:
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
            video_uploader = mobj_official.group(1)
        video_uploader = mobj.group(1)

    video_upload_date = None
    mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        # Page shows DD-MM-YYYY; reassemble as YYYYMMDD.
        video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        'uploader': video_uploader,
        'upload_date': video_upload_date,
        'title': video_title,
        'ext': video_extension,
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
def _real_extract(self, url):
    """Extract the information dictionary for a Photobucket URL.

    NOTE(review): several guards, `try:` headers, the video_url
    assignment and the result-dict opening are not visible in this chunk.
    """
    # Extract id from URL
    mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

    video_id = mobj.group(1)

    video_extension = 'flv'

    # Retrieve video webpage to extract further information
    request = compat_urllib_request.Request(url)
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

    # Extract URL, uploader, and title from webpage
    self.report_extraction(video_id)
    mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
    mediaURL = compat_urllib_parse.unquote(mobj.group(1))

    # Title and uploader come from the same <title> match.
    mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
    video_title = mobj.group(1).decode('utf-8')

    video_uploader = mobj.group(2).decode('utf-8')

        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'uploader': video_uploader,
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
def _real_extract(self, url, new_video=True):
    """Extract the information dictionary for a Yahoo! Video URL.

    Non-/watch/ URLs are rewritten to a /watch/ URL and re-extracted
    with new_video=False. NOTE(review): several guards, `try:` headers
    and the result-dict opening are not visible in this chunk.
    """
    # Extract ID from URL
    mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

    video_id = mobj.group(2)
    video_extension = 'flv'

    # Rewrite valid but non-extractable URLs as
    # extractable English language /watch/ URLs
    if re.match(self._VPAGE_URL, url) is None:
        request = compat_urllib_request.Request(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract id field')
        yahoo_id = mobj.group(1)

        mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract vid field')
        yahoo_vid = mobj.group(1)

        url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
        # Recurse once on the rewritten /watch/ URL.
        return self._real_extract(url, new_video=False)

    # Retrieve video webpage to extract further information
    request = compat_urllib_request.Request(url)
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

    # Extract uploader and title from webpage
    self.report_extraction(video_id)
    mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
    video_title = mobj.group(1).decode('utf-8')

    mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video uploader')
    # NOTE(review): group(1) captures 'people'/'profile'; the uploader name
    # appears to be group(2) — confirm against the full source.
    video_uploader = mobj.group(1).decode('utf-8')

    # Extract video thumbnail
    mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
    video_thumbnail = mobj.group(1).decode('utf-8')

    # Extract video description
    mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video description')
    video_description = mobj.group(1).decode('utf-8')
    if not video_description:
        video_description = 'No description available.'

    # Extract video height and width
    mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video height')
    yv_video_height = mobj.group(1)

    mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video width')
    yv_video_width = mobj.group(1)

    # Retrieve video playlist to extract media URL
    # I'm not completely sure what all these options are, but we
    # seem to need most of them, otherwise the server sends a 401.
    yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
    yv_bitrate = '700'  # according to Wikipedia this is hard-coded
    request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
            '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
            '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

    # Extract media URL from playlist XML
    mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        self._downloader.trouble(u'ERROR: Unable to extract media URL')
    video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
    video_url = unescapeHTML(video_url)

        'id': video_id.decode('utf-8'),
        'uploader': video_uploader,
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
        'thumbnail': video_thumbnail.decode('utf-8'),
        'description': video_description,
# VimeoIE: extracts the title, uploader, thumbnail, upload date and a direct
# play_redirect URL from a vimeo.com video page by parsing the inline
# "config" JSON embedded in the page HTML.
#
# NOTE(review): this listing is mangled -- the original file's line numbers are
# fused into every line, indentation is lost, and the gaps in the embedded
# numbering show that guard lines ('if mobj is None:', 'try:', 'return', the
# info-dict opener) were dropped.  Comments below document intent only;
# recover the exact code from version control before changing any logic.
954 class VimeoIE(InfoExtractor):
955 """Information extractor for vimeo.com."""
957 # _VALID_URL matches Vimeo URLs
958 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
961 def __init__(self, downloader=None):
962 InfoExtractor.__init__(self, downloader)
964 def report_download_webpage(self, video_id):
965 """Report webpage download."""
966 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
968 def report_extraction(self, video_id):
969 """Report information extraction."""
970 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
972 def _real_extract(self, url, new_video=True):
973 # Extract ID from URL
974 mobj = re.match(self._VALID_URL, url)
# NOTE(review): a 'if mobj is None:' guard and 'return' presumably preceded
# this trouble() call (numbering gap 975/977-978) -- confirm against history.
976 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
979 video_id = mobj.group(1)
981 # Retrieve video webpage to extract further information
982 request = compat_urllib_request.Request(url, None, std_headers)
# NOTE(review): a 'try:' line (983) is missing before the network fetch.
984 self.report_download_webpage(video_id)
985 webpage_bytes = compat_urllib_request.urlopen(request).read()
986 webpage = webpage_bytes.decode('utf-8')
987 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
988 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
991 # Now we begin extracting as much information as we can from what we
992 # retrieved. First we extract the information common to all extractors,
993 # and latter we extract those that are Vimeo specific.
994 self.report_extraction(video_id)
996 # Extract the config JSON
# The config blob is cut out of the page between ' = {config:' and ',assets:'
# (string surgery, not an HTML parser), then parsed with json.loads.
998 config = webpage.split(' = {config:')[1].split(',assets:')[0]
999 config = json.loads(config)
1001 self._downloader.trouble(u'ERROR: unable to extract info section')
# Extract title
1005 video_title = config["video"]["title"]
1007 # Extract uploader and uploader_id
1008 video_uploader = config["video"]["owner"]["name"]
1009 video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]
1011 # Extract video thumbnail
1012 video_thumbnail = config["video"]["thumbnail"]
1014 # Extract video description
1015 video_description = get_element_by_attribute("itemprop", "description", webpage)
1016 if video_description: video_description = clean_html(video_description)
1017 else: video_description = ''
1019 # Extract upload date
1020 video_upload_date = None
1021 mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1022 if mobj is not None:
# Concatenate YYYY + MM + DD into the YYYYMMDD form the downloader expects.
1023 video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1025 # Vimeo specific: extract request signature and timestamp
1026 sig = config['request']['signature']
1027 timestamp = config['request']['timestamp']
1029 # Vimeo specific: extract video codec and quality information
1030 # First consider quality, then codecs, then take everything
1031 # TODO bind to format param
1032 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1033 files = { 'hd': [], 'sd': [], 'other': []}
# Bucket each available codec into hd / sd / other, preserving codec priority.
1034 for codec_name, codec_extension in codecs:
1035 if codec_name in config["video"]["files"]:
1036 if 'hd' in config["video"]["files"][codec_name]:
1037 files['hd'].append((codec_name, codec_extension, 'hd'))
1038 elif 'sd' in config["video"]["files"][codec_name]:
1039 files['sd'].append((codec_name, codec_extension, 'sd'))
1041 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
# Pick the best bucket that has at least one entry (hd > sd > other).
1043 for quality in ('hd', 'sd', 'other'):
1044 if len(files[quality]) > 0:
1045 video_quality = files[quality][0][2]
1046 video_codec = files[quality][0][0]
1047 video_extension = files[quality][0][1]
1048 self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
# NOTE(review): an 'else:' fall-through (numbering gap 1049-1053) presumably
# reaches this error when no codec matched -- confirm.
1051 self._downloader.trouble(u'ERROR: no known codec found')
# Build the final media URL from the request signature/timestamp and the
# chosen quality/codec.
1054 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1055 %(video_id, sig, timestamp, video_quality, video_codec.upper())
# NOTE(review): the 'return [{' / 'id'/'url' lines of the info dict are
# missing (numbering gap 1056-1059).
1060 'uploader': video_uploader,
1061 'uploader_id': video_uploader_id,
1062 'upload_date': video_upload_date,
1063 'title': video_title,
1064 'ext': video_extension,
1065 'thumbnail': video_thumbnail,
1066 'description': video_description,
# ArteTvIE: extractor for videos.arte.tv (fr/de).  Distinguishes live-stream
# pages (matched by _LIVE_URL) from regular "+7" catch-up pages and scrapes
# each with chained regex passes over fetched pages (grep_webpage).
#
# NOTE(review): mangled listing -- embedded line numbers, lost indentation,
# and dropped lines wherever the fused numbering skips (returns, 'try:',
# call-argument lines).  Comments document intent only.
1070 class ArteTvIE(InfoExtractor):
1071 """arte.tv information extractor."""
1073 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1074 _LIVE_URL = r'index-[0-9]+\.html$'
1076 IE_NAME = u'arte.tv'
1078 def __init__(self, downloader=None):
1079 InfoExtractor.__init__(self, downloader)
1081 def report_download_webpage(self, video_id):
1082 """Report webpage download."""
1083 self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)
1085 def report_extraction(self, video_id):
1086 """Report information extraction."""
1087 self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)
# fetch_webpage(url): download a page's raw bytes, routing errors through
# the downloader's trouble() reporting.
1089 def fetch_webpage(self, url):
1090 self._downloader.increment_downloads()
1091 request = compat_urllib_request.Request(url)
# NOTE(review): missing 'try:' line (1092) before the fetch.
1093 self.report_download_webpage(url)
1094 webpage = compat_urllib_request.urlopen(request).read()
1095 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1096 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1098 except ValueError as err:
1099 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
# grep_webpage(url, regex, regexFlags, matchTuples): fetch url, apply one
# regex, and collect the numbered groups listed in matchTuples into an info
# dict; each tuple is (group_index, key, error_message).
1103 def grep_webpage(self, url, regex, regexFlags, matchTuples):
1104 page = self.fetch_webpage(url)
1105 mobj = re.search(regex, page, regexFlags)
# NOTE(review): 'info = {}' and the 'if mobj is None:' guard were dropped
# (numbering gap 1106-1108).
1109 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1112 for (i, key, err) in matchTuples:
1113 if mobj.group(i) is None:
1114 self._downloader.trouble(err)
# NOTE(review): an 'else:' branch (gap 1115-1116) presumably stores the group.
1117 info[key] = mobj.group(i)
# extractLiveStream(url): resolve the live player via the videothek JS file,
# then pull the rtmp path/player/url triple for the viewer's language.
1121 def extractLiveStream(self, url):
1122 video_lang = url.split('/')[-4]
1123 info = self.grep_webpage(
1125 r'src="(.*?/videothek_js.*?\.js)',
1128 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1131 http_host = url.split('/')[2]
1132 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1133 info = self.grep_webpage(
1135 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1136 '(http://.*?\.swf).*?' +
1140 (1, 'path', u'ERROR: could not extract video path: %s' % url),
1141 (2, 'player', u'ERROR: could not extract video player: %s' % url),
1142 (3, 'url', u'ERROR: could not extract video url: %s' % url)
1145 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
# extractPlus7Stream(url): follow the videorefFileUrl indirection twice
# (movie param -> language-specific <video> ref -> HD <url>) and return the
# final info for the hd-quality stream.
1147 def extractPlus7Stream(self, url):
1148 video_lang = url.split('/')[-3]
1149 info = self.grep_webpage(
1151 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1154 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1157 next_url = compat_urllib_parse.unquote(info.get('url'))
1158 info = self.grep_webpage(
1160 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1163 (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
1166 next_url = compat_urllib_parse.unquote(info.get('url'))
1168 info = self.grep_webpage(
1170 r'<video id="(.*?)".*?>.*?' +
1171 '<name>(.*?)</name>.*?' +
1172 '<dateVideo>(.*?)</dateVideo>.*?' +
1173 '<url quality="hd">(.*?)</url>',
1176 (1, 'id', u'ERROR: could not extract video id: %s' % url),
1177 (2, 'title', u'ERROR: could not extract video title: %s' % url),
1178 (3, 'date', u'ERROR: could not extract video date: %s' % url),
1179 (4, 'url', u'ERROR: could not extract video url: %s' % url)
# NOTE(review): the 'return {' opener of this info dict is missing
# (numbering gap 1180-1183).
1184 'id': info.get('id'),
1185 'url': compat_urllib_parse.unquote(info.get('url')),
1186 'uploader': u'arte.tv',
1187 'upload_date': info.get('date'),
1188 'title': info.get('title').decode('utf-8'),
1194 def _real_extract(self, url):
1195 video_id = url.split('/')[-1]
1196 self.report_extraction(video_id)
# Live pages are matched by the index-NN.html suffix; everything else goes
# through the +7 catch-up path.
1198 if re.search(self._LIVE_URL, video_id) is not None:
1199 self.extractLiveStream(url)
1202 info = self.extractPlus7Stream(url)
# GenericIE: last-resort extractor.  First probes the URL with a HEAD request
# to detect shortener redirects (restarting the download chain on the final
# URL), then scrapes the page for a JW-Player/SWFObject file= URL and derives
# id/title/extension from the URL and the <title> tag.
#
# NOTE(review): mangled listing -- embedded line numbers, lost indentation,
# and dropped lines at every numbering gap ('return's, 'try:', docstring
# quotes of the nested classes, dict opener).  Comments document intent only.
1207 class GenericIE(InfoExtractor):
1208 """Generic last-resort information extractor."""
1211 IE_NAME = u'generic'
1213 def __init__(self, downloader=None):
1214 InfoExtractor.__init__(self, downloader)
1216 def report_download_webpage(self, video_id):
1217 """Report webpage download."""
1218 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1219 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1221 def report_extraction(self, video_id):
1222 """Report information extraction."""
1223 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1225 def report_following_redirect(self, new_url):
1226 """Report information extraction."""
1227 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1229 def _test_redirect(self, url):
1230 """Check if it is a redirect, like url shorteners, in case restart chain."""
# HeadRequest: a Request whose get_method() returns "HEAD" (body line at
# 1233 was dropped) so only headers are fetched during the probe.
1231 class HeadRequest(compat_urllib_request.Request):
1232 def get_method(self):
1235 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1237 Subclass the HTTPRedirectHandler to make it use our
1238 HeadRequest also on the redirected URL
1240 def redirect_request(self, req, fp, code, msg, headers, newurl):
1241 if code in (301, 302, 303, 307):
# Re-issue the redirect target as another HEAD request, dropping the
# body-describing headers that no longer apply.
1242 newurl = newurl.replace(' ', '%20')
1243 newheaders = dict((k,v) for k,v in req.headers.items()
1244 if k.lower() not in ("content-length", "content-type"))
1245 return HeadRequest(newurl,
1247 origin_req_host=req.get_origin_req_host(),
# Non-redirect codes are surfaced as HTTPError.
1250 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1252 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1254 Fallback to GET if HEAD is not allowed (405 HTTP error)
1256 def http_error_405(self, req, fp, code, msg, headers):
1260 newheaders = dict((k,v) for k,v in req.headers.items()
1261 if k.lower() not in ("content-length", "content-type"))
1262 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1264 origin_req_host=req.get_origin_req_host(),
# Build a bare opener with exactly the handlers needed for the HEAD probe.
1268 opener = compat_urllib_request.OpenerDirector()
1269 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1270 HTTPMethodFallback, HEADRedirectHandler,
1271 compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1272 opener.add_handler(handler())
1274 response = opener.open(HeadRequest(url))
1275 new_url = response.geturl()
# NOTE(review): the 'if url == new_url: return False' comparison appears to
# have been dropped (numbering gap 1276-1279) -- confirm before relying on
# the restart behaviour below.
1280 self.report_following_redirect(new_url)
1281 self._downloader.download([new_url])
1284 def _real_extract(self, url):
1285 if self._test_redirect(url): return
1287 video_id = url.split('/')[-1]
1288 request = compat_urllib_request.Request(url)
# NOTE(review): missing 'try:' (1289) before the fetch.
1290 self.report_download_webpage(video_id)
1291 webpage = compat_urllib_request.urlopen(request).read()
1292 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1293 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1295 except ValueError as err:
1296 # since this is the last-resort InfoExtractor, if
1297 # this error is thrown, it'll be thrown here
1298 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1301 self.report_extraction(video_id)
1302 # Start with something easy: JW Player in SWFObject
1303 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
# NOTE(review): 'if mobj is None:' guards around both searches were dropped
# (gaps at 1304 and 1307).
1305 # Broaden the search a little bit
1306 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1308 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1311 # It's possible that one of the regexes
1312 # matched, but returned an empty group:
1313 if mobj.group(1) is None:
1314 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1317 video_url = compat_urllib_parse.unquote(mobj.group(1))
1318 video_id = os.path.basename(video_url)
1320 # here's a fun little line of code for you:
1321 video_extension = os.path.splitext(video_id)[1][1:]
1322 video_id = os.path.splitext(video_id)[0]
1324 # it's tempting to parse this further, but you would
1325 # have to take into account all the variations like
1326 # Video Title - Site Name
1327 # Site Name | Video Title
1328 # Video Title - Tagline | Site Name
1329 # and so on and so forth; it's just not practical
1330 mobj = re.search(r'<title>(.*)</title>', webpage)
1332 self._downloader.trouble(u'ERROR: unable to extract title')
1334 video_title = mobj.group(1)
1336 # video uploader is domain name
1337 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1339 self._downloader.trouble(u'ERROR: unable to extract title')
1341 video_uploader = mobj.group(1)
# NOTE(review): the 'return [{' / 'id'/'url' lines are missing (gap 1342-1345).
1346 'uploader': video_uploader,
1347 'upload_date': None,
1348 'title': video_title,
1349 'ext': video_extension,
# YoutubeSearchIE: handles "ytsearch[N|all]:<query>" pseudo-URLs.  Parses the
# requested result count from the prefix, pages through the GData JSON API
# 50 results at a time, and hands each watch URL back to the downloader.
#
# NOTE(review): mangled listing -- embedded line numbers, lost indentation,
# dropped lines at each numbering gap ('if mobj is None:'/'return', the
# 'if prefix == ""' branch, 'try:', loop setup).  Comments document intent only.
1353 class YoutubeSearchIE(InfoExtractor):
1354 """Information Extractor for YouTube search queries."""
1355 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1356 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1357 _max_youtube_results = 1000
1358 IE_NAME = u'youtube:search'
1360 def __init__(self, downloader=None):
1361 InfoExtractor.__init__(self, downloader)
1363 def report_download_page(self, query, pagenum):
1364 """Report attempt to download search page with given number."""
1365 query = query.decode(preferredencoding())
1366 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1368 def _real_extract(self, query):
1369 mobj = re.match(self._VALID_URL, query)
1371 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# Split "ytsearchN" prefix from the query text; empty prefix means 1 result,
# 'all' means the API maximum, otherwise the prefix is parsed as an integer.
1374 prefix, query = query.split(':')
1376 query = query.encode('utf-8')
1378 self._download_n_results(query, 1)
1380 elif prefix == 'all':
1381 self._download_n_results(query, self._max_youtube_results)
# NOTE(review): the 'else:'/'try:'/'n = int(prefix)'/'if n <= 0:' lines were
# dropped (numbering gap 1382-1386).
1387 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1389 elif n > self._max_youtube_results:
1390 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1391 n = self._max_youtube_results
1392 self._download_n_results(query, n)
1394 except ValueError: # parsing prefix as integer fails
1395 self._download_n_results(query, 1)
1398 def _download_n_results(self, query, n):
1399 """Downloads a specified number of results for a query"""
# NOTE(review): initialisation of video_ids/pagenum/limit was dropped
# (numbering gap 1400-1404).
1405 while (50 * pagenum) < limit:
1406 self.report_download_page(query, pagenum+1)
# start-index is 1-based, hence the +1.
1407 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1408 request = compat_urllib_request.Request(result_url)
1410 data = compat_urllib_request.urlopen(request).read()
1411 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1412 self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
1414 api_response = json.loads(data)['data']
1416 new_ids = list(video['id'] for video in api_response['items'])
1417 video_ids += new_ids
# Never request more than the API reports as available.
1419 limit = min(n, api_response['totalItems'])
1422 if len(video_ids) > n:
1423 video_ids = video_ids[:n]
1424 for id in video_ids:
1425 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
# GoogleSearchIE: handles "gvsearch[N|all]:<query>" pseudo-URLs for Google
# Video search.  Same prefix-parsing shape as YoutubeSearchIE; results are
# scraped from the HTML result pages with _VIDEO_INDICATOR and paging stops
# when _MORE_PAGES_INDICATOR is absent.
#
# NOTE(review): mangled listing -- embedded line numbers, lost indentation,
# dropped guard/branch lines at each numbering gap.  Comments document
# intent only.
1429 class GoogleSearchIE(InfoExtractor):
1430 """Information Extractor for Google Video search queries."""
1431 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1432 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1433 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1434 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1435 _max_google_results = 1000
1436 IE_NAME = u'video.google:search'
1438 def __init__(self, downloader=None):
1439 InfoExtractor.__init__(self, downloader)
1441 def report_download_page(self, query, pagenum):
1442 """Report attempt to download playlist page with given number."""
1443 query = query.decode(preferredencoding())
1444 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1446 def _real_extract(self, query):
1447 mobj = re.match(self._VALID_URL, query)
1449 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# Empty prefix -> 1 result; 'all' -> maximum; otherwise int(prefix).
1452 prefix, query = query.split(':')
1454 query = query.encode('utf-8')
1456 self._download_n_results(query, 1)
1458 elif prefix == 'all':
1459 self._download_n_results(query, self._max_google_results)
# NOTE(review): the 'else:'/'try:'/'n = int(prefix)'/'if n <= 0:' lines were
# dropped (numbering gap 1460-1464).
1465 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1467 elif n > self._max_google_results:
1468 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1469 n = self._max_google_results
1470 self._download_n_results(query, n)
1472 except ValueError: # parsing prefix as integer fails
1473 self._download_n_results(query, 1)
1476 def _download_n_results(self, query, n):
1477 """Downloads a specified number of results for a query"""
# NOTE(review): video_ids/pagenum initialisation and the 'while True:' loop
# head were dropped (numbering gap 1478-1482).
1483 self.report_download_page(query, pagenum)
1484 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
1485 request = compat_urllib_request.Request(result_url)
1487 page = compat_urllib_request.urlopen(request).read()
1488 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1489 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1492 # Extract video identifiers
1493 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1494 video_id = mobj.group(1)
1495 if video_id not in video_ids:
1496 video_ids.append(video_id)
1497 if len(video_ids) == n:
1498 # Specified n videos reached
1499 for id in video_ids:
1500 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
# No "next" link on this page means we've exhausted the results.
1503 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1504 for id in video_ids:
1505 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1508 pagenum = pagenum + 1
# YahooSearchIE: handles "yvsearch[N|all]:<query>" pseudo-URLs for Yahoo!
# Video search.  Same template as the other search IEs; uses an already_seen
# set to dedupe ids across pages.
#
# NOTE(review): mangled listing -- embedded line numbers, lost indentation,
# dropped guard/branch lines at each numbering gap.  Comments document
# intent only.
1511 class YahooSearchIE(InfoExtractor):
1512 """Information Extractor for Yahoo! Video search queries."""
1515 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1516 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1517 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1518 _MORE_PAGES_INDICATOR = r'\s*Next'
1519 _max_yahoo_results = 1000
1520 IE_NAME = u'video.yahoo:search'
1522 def __init__(self, downloader=None):
1523 InfoExtractor.__init__(self, downloader)
1525 def report_download_page(self, query, pagenum):
1526 """Report attempt to download playlist page with given number."""
1527 query = query.decode(preferredencoding())
1528 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1530 def _real_extract(self, query):
1531 mobj = re.match(self._VALID_URL, query)
1533 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# Empty prefix -> 1 result; 'all' -> maximum; otherwise int(prefix).
1536 prefix, query = query.split(':')
1538 query = query.encode('utf-8')
1540 self._download_n_results(query, 1)
1542 elif prefix == 'all':
1543 self._download_n_results(query, self._max_yahoo_results)
# NOTE(review): the 'else:'/'try:'/'n = int(prefix)'/'if n <= 0:' lines were
# dropped (numbering gap 1544-1548).
1549 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1551 elif n > self._max_yahoo_results:
1552 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1553 n = self._max_yahoo_results
1554 self._download_n_results(query, n)
1556 except ValueError: # parsing prefix as integer fails
1557 self._download_n_results(query, 1)
1560 def _download_n_results(self, query, n):
1561 """Downloads a specified number of results for a query"""
# NOTE(review): video_ids/pagenum initialisation and the loop head were
# dropped (numbering gaps 1562-1563, 1565-1567).
1564 already_seen = set()
1568 self.report_download_page(query, pagenum)
1569 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
1570 request = compat_urllib_request.Request(result_url)
1572 page = compat_urllib_request.urlopen(request).read()
1573 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1574 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1577 # Extract video identifiers
1578 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1579 video_id = mobj.group(1)
1580 if video_id not in already_seen:
1581 video_ids.append(video_id)
1582 already_seen.add(video_id)
1583 if len(video_ids) == n:
1584 # Specified n videos reached
1585 for id in video_ids:
1586 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
# No "next" link on this page means we've exhausted the results.
1589 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1590 for id in video_ids:
1591 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1594 pagenum = pagenum + 1
# YoutubePlaylistIE: extracts all video ids from a YouTube playlist, course,
# artist page or user list, honouring the downloader's playliststart /
# playlistend options, and queues each watch URL for download.
#
# NOTE(review): mangled listing -- embedded line numbers, lost indentation,
# dropped guard/return/loop-setup lines at each numbering gap.  Comments
# document intent only.
1597 class YoutubePlaylistIE(InfoExtractor):
1598 """Information Extractor for YouTube playlists."""
1600 _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
1601 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1602 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s'
1603 _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
1604 IE_NAME = u'youtube:playlist'
1606 def __init__(self, downloader=None):
1607 InfoExtractor.__init__(self, downloader)
1609 def report_download_page(self, playlist_id, pagenum):
1610 """Report attempt to download playlist page with given number."""
1611 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1613 def _real_extract(self, url):
1614 # Extract playlist id
1615 mobj = re.match(self._VALID_URL, url)
1617 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# Single video in a playlist context: group(3) is the video id itself, so
# delegate straight back to the downloader.
1621 if mobj.group(3) is not None:
1622 self._downloader.download([mobj.group(3)])
1625 # Download playlist pages
1626 # prefix is 'p' as default for playlists but there are other types that need extra care
1627 playlist_prefix = mobj.group(1)
1628 if playlist_prefix == 'a':
1629 playlist_access = 'artist'
# NOTE(review): the 'else:' introducing this default branch was dropped
# (numbering gap 1630).
1631 playlist_prefix = 'p'
1632 playlist_access = 'view_play_list'
1633 playlist_id = mobj.group(2)
# NOTE(review): video_ids/pagenum initialisation and the loop head were
# dropped (numbering gap 1634-1637).
1638 self.report_download_page(playlist_id, pagenum)
1639 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1640 request = compat_urllib_request.Request(url)
1642 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1643 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1644 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1647 # Extract video identifiers
1649 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1650 if mobj.group(1) not in ids_in_page:
1651 ids_in_page.append(mobj.group(1))
1652 video_ids.extend(ids_in_page)
# Stop paging once the "Next »" marker disappears.
1654 if self._MORE_PAGES_INDICATOR not in page:
1656 pagenum = pagenum + 1
1658 total = len(video_ids)
# Apply the user's playlist slice; playliststart is 1-based, playlistend of
# -1 means "to the end".
1660 playliststart = self._downloader.params.get('playliststart', 1) - 1
1661 playlistend = self._downloader.params.get('playlistend', -1)
1662 if playlistend == -1:
1663 video_ids = video_ids[playliststart:]
1665 video_ids = video_ids[playliststart:playlistend]
1667 if len(video_ids) == total:
1668 self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
1670 self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))
1672 for id in video_ids:
1673 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
# YoutubeChannelIE: pages through a channel's /videos listing, collects all
# watch?v= ids, and queues each one for download.  No playliststart/end
# slicing here, unlike the playlist/user extractors.
#
# NOTE(review): mangled listing -- embedded line numbers, lost indentation,
# dropped guard/loop-setup lines at each numbering gap.  Comments document
# intent only.
1677 class YoutubeChannelIE(InfoExtractor):
1678 """Information Extractor for YouTube channels."""
1680 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1681 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1682 _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
1683 IE_NAME = u'youtube:channel'
1685 def report_download_page(self, channel_id, pagenum):
1686 """Report attempt to download channel page with given number."""
1687 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1689 def _real_extract(self, url):
1690 # Extract channel id
1691 mobj = re.match(self._VALID_URL, url)
# NOTE(review): 'if mobj is None:'/'return' dropped (gap 1692, 1694-1695).
1693 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1696 # Download channel pages
1697 channel_id = mobj.group(1)
# NOTE(review): video_ids/pagenum initialisation and the loop head were
# dropped (numbering gap 1698-1701).
1702 self.report_download_page(channel_id, pagenum)
1703 url = self._TEMPLATE_URL % (channel_id, pagenum)
1704 request = compat_urllib_request.Request(url)
1706 page = compat_urllib_request.urlopen(request).read().decode('utf8')
1707 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1708 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1711 # Extract video identifiers
1713 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1714 if mobj.group(1) not in ids_in_page:
1715 ids_in_page.append(mobj.group(1))
1716 video_ids.extend(ids_in_page)
# Stop paging once the "Next »" marker disappears.
1718 if self._MORE_PAGES_INDICATOR not in page:
1720 pagenum = pagenum + 1
1722 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1724 for id in video_ids:
1725 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
# YoutubeUserIE: collects a user's uploads via the GData API, paging
# _GDATA_PAGE_SIZE (50) ids at a time and stopping early when a page comes
# back short; then applies playliststart/playlistend and queues the videos.
#
# NOTE(review): mangled listing -- embedded line numbers, lost indentation,
# dropped guard/loop-setup/'break' lines at each numbering gap.  Comments
# document intent only.
1729 class YoutubeUserIE(InfoExtractor):
1730 """Information Extractor for YouTube users."""
1732 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1733 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1734 _GDATA_PAGE_SIZE = 50
1735 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1736 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1737 IE_NAME = u'youtube:user'
1739 def __init__(self, downloader=None):
1740 InfoExtractor.__init__(self, downloader)
1742 def report_download_page(self, username, start_index):
1743 """Report attempt to download user page."""
1744 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1745 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1747 def _real_extract(self, url):
1749 mobj = re.match(self._VALID_URL, url)
# NOTE(review): 'if mobj is None:'/'return' dropped (gaps 1750, 1752-1753).
1751 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1754 username = mobj.group(1)
1756 # Download video ids using YouTube Data API. Result size per
1757 # query is limited (currently to 50 videos) so we need to query
1758 # page by page until there are no video ids - it means we got
# NOTE(review): the tail of this comment and the video_ids/pagenum loop
# setup were dropped (numbering gap 1759-1764).
1765 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1766 self.report_download_page(username, start_index)
1768 request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1771 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1772 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1773 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1776 # Extract video identifiers
1779 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1780 if mobj.group(1) not in ids_in_page:
1781 ids_in_page.append(mobj.group(1))
1783 video_ids.extend(ids_in_page)
1785 # A little optimization - if current page is not
1786 # "full", ie. does not contain PAGE_SIZE video ids then
1787 # we can assume that this page is the last one - there
1788 # are no more ids on further pages - no need to query
1791 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
# NOTE(review): the 'break' and 'pagenum += 1' of this loop were dropped
# (numbering gap 1792-1795).
1796 all_ids_count = len(video_ids)
# Apply the user's playlist slice; playliststart is 1-based, playlistend of
# -1 means "to the end".
1797 playliststart = self._downloader.params.get('playliststart', 1) - 1
1798 playlistend = self._downloader.params.get('playlistend', -1)
1800 if playlistend == -1:
1801 video_ids = video_ids[playliststart:]
1803 video_ids = video_ids[playliststart:playlistend]
1805 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1806 (username, all_ids_count, len(video_ids)))
1808 for video_id in video_ids:
1809 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
# BlipTVUserIE: resolves a blip.tv username to its numeric users_id from the
# profile page, then pages through the mobile full-episode-list AJAX endpoint
# collecting video hrefs, applies playliststart/playlistend, and queues them.
#
# NOTE(review): mangled listing -- embedded line numbers, lost indentation,
# dropped guard/loop-setup lines at each numbering gap.  Also note this class
# compares against self._PAGE_SIZE, whose declaration line is not visible
# here (presumably dropped with line 1816) -- confirm it exists upstream.
1812 class BlipTVUserIE(InfoExtractor):
1813 """Information Extractor for blip.tv users."""
1815 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1817 IE_NAME = u'blip.tv:user'
1819 def __init__(self, downloader=None):
1820 InfoExtractor.__init__(self, downloader)
1822 def report_download_page(self, username, pagenum):
1823 """Report attempt to download user page."""
1824 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1825 (self.IE_NAME, username, pagenum))
1827 def _real_extract(self, url):
1829 mobj = re.match(self._VALID_URL, url)
# NOTE(review): 'if mobj is None:'/'return' dropped (gaps 1830, 1832-1833).
1831 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1834 username = mobj.group(1)
1836 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1838 request = compat_urllib_request.Request(url)
# NOTE(review): missing 'try:' (gap 1839-1840) before the profile fetch that
# resolves the numeric users_id.
1841 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1842 mobj = re.search(r'data-users-id="([^"]+)"', page)
1843 page_base = page_base % mobj.group(1)
1844 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1845 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1849 # Download video ids using BlipTV Ajax calls. Result size per
1850 # query is limited (currently to 12 videos) so we need to query
1851 # page by page until there are no video ids - it means we got
# NOTE(review): the tail of this comment and the video_ids/pagenum loop
# setup were dropped (numbering gap 1852-1857).
1858 self.report_download_page(username, pagenum)
1860 request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )
1863 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1864 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
# NOTE(review): uses str(err) here where sibling extractors use
# compat_str(err) -- inconsistency worth confirming upstream.
1865 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1868 # Extract video identifiers
1871 for mobj in re.finditer(r'href="/([^"]+)"', page):
1872 if mobj.group(1) not in ids_in_page:
1873 ids_in_page.append(unescapeHTML(mobj.group(1)))
1875 video_ids.extend(ids_in_page)
1877 # A little optimization - if current page is not
1878 # "full", ie. does not contain PAGE_SIZE video ids then
1879 # we can assume that this page is the last one - there
1880 # are no more ids on further pages - no need to query
1883 if len(ids_in_page) < self._PAGE_SIZE:
# NOTE(review): the 'break' and 'pagenum += 1' of this loop were dropped
# (numbering gap 1884-1887).
1888 all_ids_count = len(video_ids)
# Apply the user's playlist slice; playliststart is 1-based, playlistend of
# -1 means "to the end".
1889 playliststart = self._downloader.params.get('playliststart', 1) - 1
1890 playlistend = self._downloader.params.get('playlistend', -1)
1892 if playlistend == -1:
1893 video_ids = video_ids[playliststart:]
1895 video_ids = video_ids[playliststart:playlistend]
1897 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1898 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1900 for video_id in video_ids:
1901 self._downloader.download([u'http://blip.tv/'+video_id])
# DepositFilesIE: extracts the real download URL from a depositfiles.com file
# page by POSTing the "Free download" gateway flag and scraping the fileshare
# form action; falls back to reporting the site's restriction message when no
# download URL is present.
#
# NOTE(review): mangled listing -- embedded line numbers, lost indentation,
# dropped guard/'try:'/'return' lines at each numbering gap.  Comments
# document intent only.
1904 class DepositFilesIE(InfoExtractor):
1905 """Information extractor for depositfiles.com"""
1907 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1908 IE_NAME = u'DepositFiles'
1910 def __init__(self, downloader=None):
1911 InfoExtractor.__init__(self, downloader)
1913 def report_download_webpage(self, file_id):
1914 """Report webpage download."""
1915 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1917 def report_extraction(self, file_id):
1918 """Report information extraction."""
1919 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1921 def _real_extract(self, url):
1922 file_id = url.split('/')[-1]
1923 # Rebuild url in english locale
1924 url = 'http://depositfiles.com/en/files/' + file_id
1926 # Retrieve file webpage with 'Free download' button pressed
# Passing a data payload makes this a POST; gateway_result=1 simulates
# pressing the "Free download" button.
1927 free_download_indication = { 'gateway_result' : '1' }
1928 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
# NOTE(review): missing 'try:' (gap 1929) before the fetch.
1930 self.report_download_webpage(file_id)
1931 webpage = compat_urllib_request.urlopen(request).read()
1932 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1933 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
1936 # Search for the real file URL
1937 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1938 if (mobj is None) or (mobj.group(1) is None):
1939 # Try to figure out reason of the error.
1940 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1941 if (mobj is not None) and (mobj.group(1) is not None):
# Collapse the restriction message's internal whitespace for one-line output.
1942 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1943 self._downloader.trouble(u'ERROR: %s' % restriction_message)
# NOTE(review): the 'else:' and 'return' lines of this branch were dropped
# (gaps 1944, 1946-1947).
1945 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1948 file_url = mobj.group(1)
1949 file_extension = os.path.splitext(file_url)[1][1:]
1951 # Search for file title
1952 mobj = re.search(r'<b title="(.*?)">', webpage)
1954 self._downloader.trouble(u'ERROR: unable to extract title')
1956 file_title = mobj.group(1).decode('utf-8')
# NOTE(review): the 'return [{' opener is missing (gap 1957-1958).
1959 'id': file_id.decode('utf-8'),
1960 'url': file_url.decode('utf-8'),
1962 'upload_date': None,
1963 'title': file_title,
1964 'ext': file_extension.decode('utf-8'),
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    _available_formats = ['video', 'highqual', 'lowqual']
    # NOTE(review): dict interior reconstructed from a garbled listing; all
    # three formats were served as mp4 upstream -- confirm against history.
    _video_extensions = {
        'video': 'mp4',
        'highqual': 'mp4',
        'lowqual': 'mp4',
    }
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page"""
        # General data: regexp per metadata field
        data = {'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
            }
        video_info = {}
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Video urls, one entry per available format
        video_urls = {}
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

        return video_info

    def _real_initialize(self):
        """Log in to Facebook using --username/--password or .netrc data.

        Login failures only emit warnings: extraction of public videos may
        still work without a session.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # A login form in the response means the credentials were rejected
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract metadata and format URLs for one Facebook video."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        try:
            page = compat_urllib_request.urlopen(request)
            video_webpage = page.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = video_info['owner']

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')

        # thumbnail image: only a warning, extraction continues
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:
            video_thumbnail = video_info['thumbnail']

        # upload date
        upload_date = None
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                # strftime raises ValueError on out-of-range components;
                # keep upload_date = None in that case.
                try:
                    upload_date = time.strftime('%Y%m%d', timetuple[0:9])
                except ValueError:
                    pass

        # description
        video_description = video_info.get('description', 'No description available.')

        # Initialize so an empty url_map yields an empty result list instead
        # of a NameError in the loop below.
        video_url_list = []
        url_map = video_info['video_urls']
        if len(list(url_map.keys())) > 0:
            # Decide which formats to download
            req_format = self._downloader.params.get('format', None)
            format_limit = self._downloader.params.get('format_limit', None)

            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            else:
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific format
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
                video_url_list = [(req_format, url_map[req_format])] # Specific format

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'mp4')

            results.append({
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension.decode('utf-8'),
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description.decode('utf-8'),
            })
        return results
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Captures the extension at the end of a media URL
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Fetch the JSON description of a blip.tv URL, or fall back to a
        direct download when the server answers with a video Content-Type."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Append the JSON-skin parameters with the correct separator
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title, ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    # Hand the open handle to the downloader so the file is
                    # not requested twice
                    'urlhandle': urlh,
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
            return
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # Some responses wrap the payload in a 'Post' object
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl']
                }
            except (ValueError, KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        # blip.tv serves the media differently depending on the User-Agent
        std_headers['User-Agent'] = 'iTunes/10.6.1'
        return [info]
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the flv URL and title for a myvideo.de watch page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Fixed: was self._download.trouble (AttributeError at runtime)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        request = compat_urllib_request.Request('http://www.myvideo.de/watch/%s' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        # The thumbnail link carries the movie base URL; the flv lives beside it
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': u'flv',
        }]
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""
    IE_NAME = u'comedycentral'

    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # NOTE(review): both tables reconstructed from a garbled listing --
    # confirm values against upstream history.
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is a verbose-mode pattern
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

    def _print_formats(self, formats):
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' % (x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        """Resolve a show/episode/clip URL to one info dict per media item."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Expand :tds-style abbreviations to the newest full episode page
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # No explicit episode -> download the newest one via redirect
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # Follow the redirect to the concrete episode URL
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', html)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', html)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        try:
            urlHandle = compat_urllib_request.urlopen(playerUrl_raw)
            playerUrl = urlHandle.geturl()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + compat_str(err))
            return

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            # Collect (bitrate, url) pairs for every rendition
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format, video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f, v in turls:
                if f == req_format:
                    format, video_url = f, v

            # Patch to download from alternative CDN, which does not
            # break on current RTMPDump builds
            broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
            better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"

            if video_url.startswith(broken_cdn):
                video_url = video_url.replace(broken_cdn, better_cdn)

            effTitle = showId + u'-' + epTitle
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
                'player_url': None #playerUrl
            }

            results.append(info)

        return results
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract the flv URL from the player configuration of an episode."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Decode with the charset announced in the Content-Type header
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        # The config URL is passed to the player as a query parameter
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'flv',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve a collegehumor video page to an f4f fragment URL via the
        moogaloop metadata XML and the Adobe HDS (f4m) manifest."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')
            return

        # Build the first-segment/first-fragment URL from the manifest parts
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the flv URL, title and thumbnail from an xvideos page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        request = compat_urllib_request.Request(r'http://www.xvideos.com/video' + video_id)
        try:
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)

        # Extract video URL (percent-encoded in the flv_url parameter)
        mobj = re.search(r'flv_url=(.+?)&', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1))

        # Extract title
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(0)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }

        return [info]
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve a track URL through the SoundCloud API and pick the
        128kbit/s mp3 stream."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json_bytes = compat_urllib_request.urlopen(request).read()
            stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        return [{
            'id': info['id'],
            'url': mediaURL,
            'uploader': info['user']['username'],
            'upload_date': info['created_at'],
            'title': info['title'],
            'ext': u'mp3',
            'description': info['description'],
        }]
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
    # NOTE(review): IE_NAME line lost in the garbled listing; report_* methods
    # below require it -- confirm value against upstream.
    IE_NAME = u'infoq'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the rtmpe URL and title from an InfoQ presentation page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        self.report_webpage(url)

        request = compat_urllib_request.Request(url)
        try:
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        self.report_extraction(url)

        # Extract video URL (base64-encoded in the jsclassref attribute)
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + compat_urllib_parse.unquote(mobj.group(1).decode('base64'))

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1).decode('utf-8')

        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
2896 class MixcloudIE(InfoExtractor):
2897 """Information extractor for www.mixcloud.com"""
# NOTE(review): this excerpt elides interior lines (the embedded original
# numbering jumps), so try/except/return scaffolding around several calls
# is not visible here; comments describe only the visible code.
2899 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2900 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2901 IE_NAME = u'mixcloud'
2903 def __init__(self, downloader=None):
2904 InfoExtractor.__init__(self, downloader)
# Progress-reporting helpers: write status lines through the downloader.
2906 def report_download_json(self, file_id):
2907 """Report JSON download."""
2908 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2910 def report_extraction(self, file_id):
2911 """Report information extraction."""
2912 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2914 def get_urls(self, jsonData, fmt, bitrate='best'):
2915 """Get urls from 'audio_formats' section in json"""
# 'best' (or an unknown/None bitrate) falls back to max(bitrate_list);
# a TypeError while indexing means this format has no per-bitrate dict,
# in which case the format entry itself is the url list.
2918 bitrate_list = jsonData[fmt]
2919 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2920 bitrate = max(bitrate_list) # select highest
2922 url_list = jsonData[fmt][bitrate]
2923 except TypeError: # we have no bitrate info.
2924 url_list = jsonData[fmt]
2927 def check_urls(self, url_list):
2928 """Returns 1st active url from list"""
# Probes each candidate URL with a GET; network errors move on to the
# next candidate.
2929 for url in url_list:
2931 compat_urllib_request.urlopen(url)
2933 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2938 def _print_formats(self, formats):
# Dump the format/bitrate/extension table for --list-formats.
2939 print('Available formats:')
2940 for fmt in formats.keys():
2941 for b in formats[fmt]:
2943 ext = formats[fmt][b][0]
2944 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2945 except TypeError: # we have no bitrate info
2946 ext = formats[fmt][0]
2947 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2950 def _real_extract(self, url):
2951 mobj = re.match(self._VALID_URL, url)
2953 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2955 # extract uploader & filename from url
# NOTE(review): .decode('utf-8') on re match groups is Python-2-era bytes
# handling; on Python 3 these groups are already str and .decode would
# raise AttributeError — confirm target interpreter.
2956 uploader = mobj.group(1).decode('utf-8')
2957 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2959 # construct API request
2960 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2961 # retrieve .json file with links to files
2962 request = compat_urllib_request.Request(file_url)
2964 self.report_download_json(file_url)
2965 jsonData = compat_urllib_request.urlopen(request).read()
2966 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2967 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
# Parse the API response and pull out player URL + available formats.
2971 json_data = json.loads(jsonData)
2972 player_url = json_data['player_swf_url']
2973 formats = dict(json_data['audio_formats'])
2975 req_format = self._downloader.params.get('format', None)
2978 if self._downloader.params.get('listformats', None):
2979 self._print_formats(formats)
# With no explicit format (or 'best'): probe each format's URL list and
# keep the first one that responds. Otherwise require the exact format.
2982 if req_format is None or req_format == 'best':
2983 for format_param in formats.keys():
2984 url_list = self.get_urls(formats, format_param)
2986 file_url = self.check_urls(url_list)
2987 if file_url is not None:
2990 if req_format not in list(formats.keys()):
2991 self._downloader.trouble(u'ERROR: format is not available')
2994 url_list = self.get_urls(formats, req_format)
2995 file_url = self.check_urls(url_list)
2996 format_param = req_format
2999 'id': file_id.decode('utf-8'),
3000 'url': file_url.decode('utf-8'),
3001 'uploader': uploader.decode('utf-8'),
3002 'upload_date': None,
3003 'title': json_data['name'],
3004 'ext': file_url.split('.')[-1].decode('utf-8'),
3005 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3006 'thumbnail': json_data['thumbnail_url'],
3007 'description': json_data['description'],
3008 'player_url': player_url.decode('utf-8'),
3011 class StanfordOpenClassroomIE(InfoExtractor):
3012 """Information extractor for Stanford's Open ClassRoom"""
# NOTE(review): this excerpt elides interior lines (the embedded original
# numbering jumps); comments describe only the visible code.
3014 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3015 IE_NAME = u'stanfordoc'
3017 def report_download_webpage(self, objid):
3018 """Report webpage download."""
3019 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3021 def report_extraction(self, video_id):
3022 """Report information extraction."""
3023 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3025 def _real_extract(self, url):
3026 mobj = re.match(self._VALID_URL, url)
3028 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Three URL shapes: a single video (course+video groups), a course page
# (course group only), or the site root. Course/root pages expand into
# 'reference' entries that are re-fed through self.extract().
3031 if mobj.group('course') and mobj.group('video'): # A specific video
3032 course = mobj.group('course')
3033 video = mobj.group('video')
3035 'id': course + '_' + video,
3037 'upload_date': None,
3040 self.report_extraction(info['id'])
3041 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3042 xmlUrl = baseUrl + video + '.xml'
3044 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
3045 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3046 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
# Per-video metadata XML carries <title> and <videoFile> (relative path).
3048 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3050 info['title'] = mdoc.findall('./title')[0].text
3051 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3053 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3055 info['ext'] = info['url'].rpartition('.')[2]
3057 elif mobj.group('course'): # A course page
3058 course = mobj.group('course')
3063 'upload_date': None,
3066 self.report_download_webpage(info['id'])
3068 coursepage = compat_urllib_request.urlopen(url).read()
3069 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3070 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
# Page title falls back to the id when the <h1> is absent.
3073 m = re.search('<h1>([^<]+)</h1>', coursepage)
3075 info['title'] = unescapeHTML(m.group(1))
3077 info['title'] = info['id']
3079 m = re.search('<description>([^<]+)</description>', coursepage)
3081 info['description'] = unescapeHTML(m.group(1))
# Collect every VideoPage link on the course page as a reference entry.
3083 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3086 'type': 'reference',
3087 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3091 for entry in info['list']:
3092 assert entry['type'] == 'reference'
3093 results += self.extract(entry['url'])
3098 'id': 'Stanford OpenClassroom',
3101 'upload_date': None,
3104 self.report_download_webpage(info['id'])
3105 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3107 rootpage = compat_urllib_request.urlopen(rootURL).read()
3108 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3109 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
3112 info['title'] = info['id']
# Root page: recurse into every CoursePage link.
3114 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3117 'type': 'reference',
3118 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3123 for entry in info['list']:
3124 assert entry['type'] == 'reference'
3125 results += self.extract(entry['url'])
3128 class MTVIE(InfoExtractor):
3129 """Information extractor for MTV.com"""
# NOTE(review): this excerpt elides interior lines (the embedded original
# numbering jumps); comments describe only the visible code.
3131 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3134 def report_webpage(self, video_id):
3135 """Report webpage download."""
3136 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3138 def report_extraction(self, video_id):
3139 """Report information extraction."""
3140 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3142 def _real_extract(self, url):
3143 mobj = re.match(self._VALID_URL, url)
3145 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Default to http:// when the scheme was omitted (proto group is optional).
3147 if not mobj.group('proto'):
3148 url = 'http://' + url
3149 video_id = mobj.group('videoid')
3150 self.report_webpage(video_id)
3152 request = compat_urllib_request.Request(url)
3154 webpage = compat_urllib_request.urlopen(request).read()
3155 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3156 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
# Scrape song name / performer / playlist id out of the page's <meta> tags.
# NOTE(review): .decode('iso-8859-1') on match groups is Python-2-era
# bytes handling; on Python 3 groups are already str — confirm interpreter.
3159 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3161 self._downloader.trouble(u'ERROR: unable to extract song name')
3163 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3164 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3166 self._downloader.trouble(u'ERROR: unable to extract performer')
3168 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3169 video_title = performer + ' - ' + song_name
3171 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3173 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3175 mtvn_uri = mobj.group(1)
3177 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3179 self._downloader.trouble(u'ERROR: unable to extract content id')
3181 content_id = mobj.group(1)
# Query the mediaGen service for the renditions XML of this video.
3183 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3184 self.report_extraction(video_id)
3185 request = compat_urllib_request.Request(videogen_url)
3187 metadataXml = compat_urllib_request.urlopen(request).read()
3188 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3189 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
3192 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3193 renditions = mdoc.findall('.//rendition')
3195 # For now, always pick the highest quality.
3196 rendition = renditions[-1]
# Format label is built from the MIME subtype + WxH + bitrate attributes.
3199 _,_,ext = rendition.attrib['type'].partition('/')
3200 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3201 video_url = rendition.find('./src').text
3203 self._downloader.trouble('Invalid rendition field.')
3209 'uploader': performer,
3210 'upload_date': None,
3211 'title': video_title,
3219 class YoukuIE(InfoExtractor):
# Information extractor for v.youku.com.
# NOTE(review): this excerpt elides interior lines (the embedded original
# numbering jumps — e.g. the def line of _gen_sid is not visible);
# comments describe only the visible code.
3221 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3224 def __init__(self, downloader=None):
3225 InfoExtractor.__init__(self, downloader)
3227 def report_download_webpage(self, file_id):
3228 """Report webpage download."""
3229 self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)
3231 def report_extraction(self, file_id):
3232 """Report information extraction."""
3233 self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)
# Session id: millisecond timestamp concatenated with two random numbers
# (belongs to _gen_sid, whose def line is elided in this excerpt).
3236 nowTime = int(time.time() * 1000)
3237 random1 = random.randint(1000,1998)
3238 random2 = random.randint(1000,9999)
3240 return "%d%d%d" %(nowTime,random1,random2)
3242 def _get_file_ID_mix_string(self, seed):
# Deterministic, seed-driven shuffle of a fixed character set (a linear
# congruential step picks one remaining char per iteration); the result
# is the translation table used by _get_file_id.
3244 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3246 for i in range(len(source)):
3247 seed = (seed * 211 + 30031 ) % 65536
3248 index = math.floor(seed / 65536 * len(source) )
3249 mixed.append(source[int(index)])
3250 source.remove(source[int(index)])
3251 #return ''.join(mixed)
3254 def _get_file_id(self, fileId, seed):
# Map the '*'-separated numeric ids through the shuffled character table
# to recover the real file id.
3255 mixed = self._get_file_ID_mix_string(seed)
3256 ids = fileId.split('*')
3260 realId.append(mixed[int(ch)])
3261 return ''.join(realId)
3263 def _real_extract(self, url):
3264 mobj = re.match(self._VALID_URL, url)
3266 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3268 video_id = mobj.group('ID')
3270 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3272 request = compat_urllib_request.Request(info_url, None, std_headers)
3274 self.report_download_webpage(video_id)
3275 jsondata = compat_urllib_request.urlopen(request).read()
3276 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3277 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3280 self.report_extraction(video_id)
3282 jsonstr = jsondata.decode('utf-8')
3283 config = json.loads(jsonstr)
3285 video_title = config['data'][0]['title']
3286 seed = config['data'][0]['seed']
3288 format = self._downloader.params.get('format', None)
3289 supported_format = list(config['data'][0]['streamfileids'].keys())
# Format selection: None/'best' prefers hd2 when offered, 'worst' takes
# the low end (the concrete assignments are elided in this excerpt).
3291 if format is None or format == 'best':
3292 if 'hd2' in supported_format:
3297 elif format == 'worst':
3305 fileid = config['data'][0]['streamfileids'][format]
3306 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3307 except (UnicodeDecodeError, ValueError, KeyError):
3308 self._downloader.trouble(u'ERROR: unable to extract info section')
3312 sid = self._gen_sid()
3313 fileid = self._get_file_id(fileid, seed)
3315 #column 8,9 of fileid represent the segment number
3316 #fileid[7:9] should be changed
3317 for index, key in enumerate(keys):
# Per-segment URL: splice the segment index as two hex digits into the
# fileid, and pass the per-segment key 'k' from the config.
3319 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3320 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3323 'id': '%s_part%02d' % (video_id, index),
3324 'url': download_url,
3326 'upload_date': None,
3327 'title': video_title,
3330 files_info.append(info)
3335 class XNXXIE(InfoExtractor):
3336 """Information extractor for xnxx.com"""
# NOTE(review): this excerpt elides interior lines (the embedded original
# numbering jumps); comments describe only the visible code.
3338 _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
# Page-scraping regexes: flv url, page title, and thumbnail url.
3340 VIDEO_URL_RE = r'flv_url=(.*?)&'
3341 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3342 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3344 def report_webpage(self, video_id):
3345 """Report webpage download"""
3346 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3348 def report_extraction(self, video_id):
3349 """Report information extraction"""
3350 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3352 def _real_extract(self, url):
3353 mobj = re.match(self._VALID_URL, url)
3355 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3357 video_id = mobj.group(1)
3359 self.report_webpage(video_id)
3361 # Get webpage content
3363 webpage_bytes = compat_urllib_request.urlopen(url).read()
3364 webpage = webpage_bytes.decode('utf-8')
3365 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3366 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
# The flv url is percent-encoded inside the page; unquote it.
3369 result = re.search(self.VIDEO_URL_RE, webpage)
3371 self._downloader.trouble(u'ERROR: unable to extract video url')
3373 video_url = compat_urllib_parse.unquote(result.group(1))
3375 result = re.search(self.VIDEO_TITLE_RE, webpage)
3377 self._downloader.trouble(u'ERROR: unable to extract video title')
3379 video_title = result.group(1)
3381 result = re.search(self.VIDEO_THUMB_RE, webpage)
3383 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3385 video_thumbnail = result.group(1)
3391 'upload_date': None,
3392 'title': video_title,
3394 'thumbnail': video_thumbnail,
3395 'description': None,
3399 class GooglePlusIE(InfoExtractor):
3400 """Information extractor for plus.google.com."""
# NOTE(review): this excerpt elides interior lines (the embedded original
# numbering jumps); comments describe only the visible code.
3402 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3403 IE_NAME = u'plus.google'
3405 def __init__(self, downloader=None):
3406 InfoExtractor.__init__(self, downloader)
3408 def report_extract_entry(self, url):
3409 """Report downloading entry."""
3410 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3412 def report_date(self, upload_date):
3413 """Report the entry's upload date."""
3414 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3416 def report_uploader(self, uploader):
3417 """Report the entry's uploader."""
3418 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3420 def report_title(self, video_title):
3421 """Report the entry's title."""
3422 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3424 def report_extract_vid_page(self, video_page):
3425 """Report information extraction."""
3426 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3428 def _real_extract(self, url):
3429 # Extract id from URL
3430 mobj = re.match(self._VALID_URL, url)
3432 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3435 post_url = mobj.group(0)
3436 video_id = mobj.group(1)
3438 video_extension = 'flv'
3440 # Step 1, Retrieve post webpage to extract further information
3441 self.report_extract_entry(post_url)
3442 request = compat_urllib_request.Request(post_url)
3444 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3445 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3446 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3449 # Extract update date
3451 pattern = 'title="Timestamp">(.*?)</a>'
3452 mobj = re.search(pattern, webpage)
3454 upload_date = mobj.group(1)
3455 # Convert timestring to a format suitable for filename
3456 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3457 upload_date = upload_date.strftime('%Y%m%d')
3458 self.report_date(upload_date)
# Extract uploader from the post's author link.
3462 pattern = r'rel\="author".*?>(.*?)</a>'
3463 mobj = re.search(pattern, webpage)
3465 uploader = mobj.group(1)
3466 self.report_uploader(uploader)
3469 # Get the first line for title
3471 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3472 mobj = re.search(pattern, webpage)
3474 video_title = mobj.group(1)
3475 self.report_title(video_title)
3477 # Step 2, Stimulate clicking the image box to launch video
3478 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3479 mobj = re.search(pattern, webpage)
3481 self._downloader.trouble(u'ERROR: unable to extract video page URL')
3483 video_page = mobj.group(1)
3484 request = compat_urllib_request.Request(video_page)
3486 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3487 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3488 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3490 self.report_extract_vid_page(video_page)
3493 # Extract video links on video page
3494 """Extract video links of all sizes"""
3495 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3496 mobj = re.findall(pattern, webpage)
3498 self._downloader.trouble(u'ERROR: unable to extract video links')
3500 # Sort in resolution
3501 links = sorted(mobj)
3503 # Choose the lowest of the sort, i.e. highest resolution
3504 video_url = links[-1]
3505 # Only get the url. The resolution part in the tuple has no use anymore
3506 video_url = video_url[-1]
3507 # Treat escaped \u0026 style hex
# Python 2: str has .decode; on Python 3 .decode raises AttributeError,
# so re-encode to ascii bytes and unicode-escape decode instead.
3509 video_url = video_url.decode("unicode_escape")
3510 except AttributeError: # Python 3
3511 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3517 'uploader': uploader,
3518 'upload_date': upload_date,
3519 'title': video_title,
3520 'ext': video_extension,
3523 class NBAIE(InfoExtractor):
# Information extractor for nba.com video pages.
# NOTE(review): this excerpt elides interior lines (the embedded original
# numbering jumps); comments describe only the visible code.
3524 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3527 def report_extraction(self, video_id):
3528 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3530 def _real_extract(self, url):
3531 mobj = re.match(self._VALID_URL, url)
3533 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3536 video_id = mobj.group(1)
3537 if video_id.endswith('/index.html'):
3538 video_id = video_id[:-len('/index.html')]
3540 self.report_extraction(video_id)
3542 urlh = compat_urllib_request.urlopen(url)
3543 webpage_bytes = urlh.read()
3544 webpage = webpage_bytes.decode('utf-8', 'ignore')
3545 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3546 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
# The media URL is derived directly from the page path (CDN convention),
# not scraped from the page.
3549 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
# Helper: first regex group from the page, HTML-unescaped; else default.
3550 def _findProp(rexp, default=None):
3551 m = re.search(rexp, webpage)
3553 return unescapeHTML(m.group(1))
3557 shortened_video_id = video_id.rpartition('/')[2]
3558 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3560 'id': shortened_video_id,
# NOTE(review): 'uploader_date' looks like a typo for 'upload_date' (the
# key every other extractor here emits) — confirm against FileDownloader.
3564 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3565 'description': _findProp(r'<div class="description">(.*?)</h1>'),
3569 class JustinTVIE(InfoExtractor):
3570 """Information extractor for justin.tv and twitch.tv"""
3571 # TODO: One broadcast may be split into multiple videos. The key
3572 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3573 # starts at 1 and increases. Can we treat all parts as one video?
# NOTE(review): this excerpt elides interior lines (the embedded original
# numbering jumps); comments describe only the visible code.
3575 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3576 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3577 _JUSTIN_PAGE_LIMIT = 100
3578 IE_NAME = u'justin.tv'
3580 def report_extraction(self, file_id):
3581 """Report information extraction."""
3582 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3584 def report_download_page(self, channel, offset):
3585 """Report attempt to download a single page of videos."""
3586 self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
3587 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3589 # Return count of items, list of *valid* items
3590 def _parse_page(self, url):
3592 urlh = compat_urllib_request.urlopen(url)
3593 webpage_bytes = urlh.read()
3594 webpage = webpage_bytes.decode('utf-8', 'ignore')
3595 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3596 self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
3599 response = json.loads(webpage)
3601 for clip in response:
3602 video_url = clip['video_file_url']
3604 video_extension = os.path.splitext(video_url)[1][1:]
# created_on starts 'YYYY-MM-DD…'; strip dashes for the YYYYMMDD format.
3605 video_date = re.sub('-', '', clip['created_on'][:10])
3609 'title': clip['title'],
3610 'uploader': clip.get('user_id', clip.get('channel_id')),
3611 'upload_date': video_date,
3612 'ext': video_extension,
3614 return (len(response), info)
3616 def _real_extract(self, url):
3617 mobj = re.match(self._VALID_URL, url)
3619 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Group 1 = channel name, group 2 (optional, '/b/…') = single broadcast;
# pick the matching API endpoint accordingly.
3622 api = 'http://api.justin.tv'
3623 video_id = mobj.group(mobj.lastindex)
3625 if mobj.lastindex == 1:
3627 api += '/channel/archives/%s.json'
3629 api += '/clip/show/%s.json'
3630 api = api % (video_id,)
3632 self.report_extraction(video_id)
# Page through the API _JUSTIN_PAGE_LIMIT entries at a time; a short page
# (or non-paged single-clip mode) terminates the loop.
3636 limit = self._JUSTIN_PAGE_LIMIT
3639 self.report_download_page(video_id, offset)
3640 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3641 page_count, page_info = self._parse_page(page_url)
3642 info.extend(page_info)
3643 if not paged or page_count != limit:
3648 class FunnyOrDieIE(InfoExtractor):
3649 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3650 IE_NAME = u'FunnyOrDie'
3652 def report_extraction(self, video_id):
3653 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3655 def _real_extract(self, url):
3656 mobj = re.match(self._VALID_URL, url)
3658 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3661 video_id = mobj.group('id')
3662 self.report_extraction(video_id)
3664 urlh = compat_urllib_request.urlopen(url)
3665 webpage_bytes = urlh.read()
3666 webpage = webpage_bytes.decode('utf-8', 'ignore')
3667 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3668 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
3671 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3673 self._downloader.trouble(u'ERROR: unable to find video information')
3674 video_url = unescapeHTML(m.group('url'))
3677 m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
3679 self._downloader.trouble(u'Cannot find video title')
3680 title = unescapeHTML(m.group('title'))
3682 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3684 desc = unescapeHTML(m.group('desc'))
3693 'description': desc,