2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
14 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title:          Video title, unescaped.
    ext:            Video filename extension.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader_id:    Nickname or id of the video uploader.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """
69 def __init__(self, downloader=None):
70 """Constructor. Receives an optional downloader."""
72 self.set_downloader(downloader)
74 def suitable(self, url):
75 """Receives a URL and returns True if suitable for this IE."""
76 return re.match(self._VALID_URL, url) is not None
79 """Getter method for _WORKING."""
83 """Initializes an instance (authentication, etc)."""
85 self._real_initialize()
88 def extract(self, url):
89 """Extracts URL information and returns it in list of dicts."""
91 return self._real_extract(url)
93 def set_downloader(self, downloader):
94 """Sets the downloader for this IE."""
95 self._downloader = downloader
    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""

        # NOTE(review): this return statement belongs to the IE_NAME property;
        # its decorator and ``def`` line are elided from this view. It derives
        # the IE name by stripping the trailing "IE" from the class name.
        return type(self).__name__[:-2]
    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        # NOTE(review): guard lines (e.g. ``if note is None:``, ``try:`` and
        # ``if errnote is None:``) are elided from this view; the fragments
        # below are kept verbatim.
        note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            errnote = u'Unable to download webpage'
            # Re-raise as ExtractorError, carrying the original traceback.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
121 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
122 """ Returns the data of the page as a string """
123 urlh = self._request_webpage(url_or_request, video_id, note, errnote)
124 webpage_bytes = urlh.read()
125 return webpage_bytes.decode('utf-8', 'replace')
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # NOTE(review): the assignment line opening this verbose regex
    # (``_VALID_URL = r"""...``) is elided from this view; the pattern body
    # is kept verbatim below.
                     (?:https?://)?                                       # http(s):// (optional)
                     (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                        tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
                     (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                     (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                     (?:                                                  # the various things that can precede the ID:
                         (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                         |(?:                                             # or the v= param in all its forms
                             (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                             (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                             (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                         )?                                               # optional -> youtube.com/xxxx is OK
                     )?                                                   # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                     # here is it! the YouTube video ID
                     (?(1).+)?                                            # if we found the ID, everything can follow
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container/extension map; most entries are elided from this view.
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # itag -> display-dimension map; its entries are elided from this view.
    _video_dimensions = {
189 def suitable(self, url):
190 """Receives a URL and returns True if suitable for this IE."""
191 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
193 def report_lang(self):
194 """Report attempt to set language."""
195 self._downloader.to_screen(u'[youtube] Setting language')
197 def report_login(self):
198 """Report attempt to log in."""
199 self._downloader.to_screen(u'[youtube] Logging in')
201 def report_age_confirmation(self):
202 """Report attempt to confirm age."""
203 self._downloader.to_screen(u'[youtube] Confirming age')
205 def report_video_webpage_download(self, video_id):
206 """Report attempt to download video webpage."""
207 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
209 def report_video_info_webpage_download(self, video_id):
210 """Report attempt to download video info webpage."""
211 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
213 def report_video_subtitles_download(self, video_id):
214 """Report attempt to download video info webpage."""
215 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
217 def report_information_extraction(self, video_id):
218 """Report attempt to extract video information."""
219 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
221 def report_unavailable_format(self, video_id, format):
222 """Report extracted video URL."""
223 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
225 def report_rtmp_download(self):
226 """Indicate the download will use the RTMP protocol."""
227 self._downloader.to_screen(u'[youtube] RTMP download detected')
    def _closed_captions_xml_to_srt(self, xml_string):
        # Convert the closed-caption XML into SRT subtitle text.
        # NOTE(review): the initialization of ``srt`` and the numeric
        # conversion of ``start`` are elided from this view.
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # default caption duration when none given
            end = start + float(dur)
            # Render SRT timestamps as HH:MM:SS,mmm.
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
    def _extract_subtitles(self, video_id):
        # Download closed captions for *video_id*.
        # Returns a pair: (warning_message_or_None, srt_text_or_None).
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        # NOTE(review): the ``try:`` line wrapping this download is elided.
            srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        # Build a language_code -> track_name map from the track list.
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        # Language preference: user-requested, then English, then first found.
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
        # NOTE(review): the assignment below is the fallback branch; the
        # intervening lines of this if/elif/else chain are elided.
            srt_lang = list(srt_lang_list.keys())[0]
        if not srt_lang in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
        # NOTE(review): another elided ``try:`` wraps this download.
            srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        # NOTE(review): the guard condition for this return is elided.
            return (u'WARNING: unable to download video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))
    def _print_formats(self, formats):
        # List each available itag with its extension and dimensions.
        print('Available formats:')
        # NOTE(review): the ``for x in formats:`` loop header is elided here.
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
    def _real_initialize(self):
        # Set language, gather credentials, log in and confirm age.
        # NOTE(review): several guard/``try:`` lines and dict-literal openers
        # are elided from this view; fragments below are kept verbatim.
        if self._downloader is None:
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))

        # Set the interface language (best-effort; failure only warns).
        request = compat_urllib_request.Request(self._LANG_URL)
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))

        # No authentication to be performed
            'current_form': 'loginForm',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # The login form being served back indicates rejected credentials.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))

        # Confirm age by submitting the verification form.
            'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
    def _extract_id(self, url):
        # Pull the video id out of *url* via the verbose _VALID_URL regex.
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # NOTE(review): the ``if mobj is None:`` guard and the final return
        # are elided from this view.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(2)
    def _real_extract(self, url):
        # NOTE(review): numerous guard/``try:``/``else:`` lines and dict
        # openers are elided from this view; orphaned fragments below are
        # kept verbatim.
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Download the watch page (has_verified=1 skips some interstitials).
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
            # Unescape the backslash-escaped URL characters.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Try several get_video_info variants until one returns a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id (best-effort; only a warning on failure)
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
            video_uploader_id = mobj.group(1)
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: normalize separators, then try several date formats
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
            video_description = ''

        # closed captions (only when requested via --write-subtitles)
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
                self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific formats. We pick the first in a slash-delimited sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                        video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

        # Build one result dict per selected format.
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        """Create the extractor and attach an optional downloader."""
        InfoExtractor.__init__(self, downloader)
558 def report_disclaimer(self):
559 """Report disclaimer retrieval."""
560 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
562 def report_age_confirmation(self):
563 """Report attempt to confirm age."""
564 self._downloader.to_screen(u'[metacafe] Confirming age')
566 def report_download_webpage(self, video_id):
567 """Report webpage download."""
568 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
570 def report_extraction(self, video_id):
571 """Report information extraction."""
572 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
    def _real_initialize(self):
        # NOTE(review): ``try:`` lines and the filter-form dict opener are
        # elided from this view; fragments below are kept verbatim.
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age by posting the family-filter form.
            'submit': "Continue - I'm over 18",
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
    def _real_extract(self, url):
        # NOTE(review): several guard/``try:``/``else:`` lines and the result
        # list opener are elided from this view; fragments are kept verbatim.
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate "yt-" prefixed ids to the YouTube extractor.
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
            # Fallback: pull the media URL from the flashvars payload.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # Result dict entries (the surrounding return/list opener is elided).
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        """Create the extractor and attach an optional downloader."""
        InfoExtractor.__init__(self, downloader)
683 def report_extraction(self, video_id):
684 """Report information extraction."""
685 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
    def _real_extract(self, url):
        # NOTE(review): several guard/``else:`` lines and the result dict
        # opener are elided from this view; fragments are kept verbatim.
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Strip title/query suffixes from the matched path component.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(mobj.group(1))
        # Probe qualities from best to worst; the first present key wins.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
            self._downloader.trouble(u'ERROR: unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
            self._downloader.trouble(u'ERROR: unable to extract video URL')

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
            # looking for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
                video_uploader = mobj_official.group(1)
            video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
            # Page shows DD-MM-YYYY; reassemble as YYYYMMDD.
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        """Create the extractor and attach an optional downloader."""
        InfoExtractor.__init__(self, downloader)
771 def report_download_webpage(self, video_id):
772 """Report webpage download."""
773 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
775 def report_extraction(self, video_id):
776 """Report information extraction."""
777 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
    def _real_extract(self, url):
        # NOTE(review): guard/``try:`` lines, the video_url assignment and the
        # result dict opener are elided from this view; fragments verbatim.
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        """Create the extractor and attach an optional downloader."""
        InfoExtractor.__init__(self, downloader)
840 def report_download_webpage(self, video_id):
841 """Report webpage download."""
842 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
844 def report_extraction(self, video_id):
845 """Report information extraction."""
846 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
    def _real_extract(self, url, new_video=True):
        # NOTE(review): many guard/``try:`` lines and the result dict opener
        # are elided from this view; orphaned fragments are kept verbatim.
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
                self._downloader.trouble(u'ERROR: Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            # Re-enter with the canonical /watch/ URL; new_video=False
            # prevents a second rewrite pass.
            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
        yv_bitrate = '700' # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

            'id':       video_id.decode('utf-8'),
            'uploader': video_uploader,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
# NOTE(review): this chunk is a numbered source listing; each line keeps its
# original line-number prefix, and gaps in that numbering show that guard
# lines (`if mobj is None:`, `try:`, `return`, the final `return [{...}]`)
# were elided. Code below is kept byte-identical; comments only.
# Extracts a direct video URL from a vimeo.com page by parsing the embedded
# player config JSON (title, owner, thumbnail, codec/quality tables).
970 class VimeoIE(InfoExtractor):
971 """Information extractor for vimeo.com."""
973 # _VALID_URL matches Vimeo URLs
974 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
977 def __init__(self, downloader=None):
978 InfoExtractor.__init__(self, downloader)
980 def report_download_webpage(self, video_id):
981 """Report webpage download."""
982 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
984 def report_extraction(self, video_id):
985 """Report information extraction."""
986 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
988 def _real_extract(self, url, new_video=True):
989 # Extract ID from URL
990 mobj = re.match(self._VALID_URL, url)
992 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
995 video_id = mobj.group(1)
997 # Retrieve video webpage to extract further information
998 request = compat_urllib_request.Request(url, None, std_headers)
1000 self.report_download_webpage(video_id)
1001 webpage_bytes = compat_urllib_request.urlopen(request).read()
1002 webpage = webpage_bytes.decode('utf-8')
1003 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1004 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1007 # Now we begin extracting as much information as we can from what we
1008 # retrieved. First we extract the information common to all extractors,
1009 # and latter we extract those that are Vimeo specific.
1010 self.report_extraction(video_id)
1012 # Extract the config JSON
# Splices the JSON out of the page between the ' = {config:' and ',assets:'
# markers — brittle string surgery; breaks if Vimeo reorders the keys.
1014 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1015 config = json.loads(config)
1017 self._downloader.trouble(u'ERROR: unable to extract info section')
1021 video_title = config["video"]["title"]
1023 # Extract uploader and uploader_id
1024 video_uploader = config["video"]["owner"]["name"]
1025 video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]
1027 # Extract video thumbnail
1028 video_thumbnail = config["video"]["thumbnail"]
1030 # Extract video description
1031 video_description = get_element_by_attribute("itemprop", "description", webpage)
1032 if video_description: video_description = clean_html(video_description)
1033 else: video_description = ''
1035 # Extract upload date
1036 video_upload_date = None
1037 mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1038 if mobj is not None:
# Concatenates YYYY+MM+DD into the YYYYMMDD format the downloader expects.
1039 video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1041 # Vimeo specific: extract request signature and timestamp
1042 sig = config['request']['signature']
1043 timestamp = config['request']['timestamp']
1045 # Vimeo specific: extract video codec and quality information
1046 # First consider quality, then codecs, then take everything
1047 # TODO bind to format param
1048 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1049 files = { 'hd': [], 'sd': [], 'other': []}
1050 for codec_name, codec_extension in codecs:
1051 if codec_name in config["video"]["files"]:
1052 if 'hd' in config["video"]["files"][codec_name]:
1053 files['hd'].append((codec_name, codec_extension, 'hd'))
1054 elif 'sd' in config["video"]["files"][codec_name]:
1055 files['sd'].append((codec_name, codec_extension, 'sd'))
1057 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
# First non-empty quality bucket wins; codecs list order breaks ties.
1059 for quality in ('hd', 'sd', 'other'):
1060 if len(files[quality]) > 0:
1061 video_quality = files[quality][0][2]
1062 video_codec = files[quality][0][0]
1063 video_extension = files[quality][0][1]
1064 self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1067 self._downloader.trouble(u'ERROR: no known codec found')
1070 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1071 %(video_id, sig, timestamp, video_quality, video_codec.upper())
# Result-dict fragment (its enclosing `return [{` line is elided above).
1076 'uploader': video_uploader,
1077 'uploader_id': video_uploader_id,
1078 'upload_date': video_upload_date,
1079 'title': video_title,
1080 'ext': video_extension,
1081 'thumbnail': video_thumbnail,
1082 'description': video_description,
# NOTE(review): numbered listing with elided lines (try:/return/regex-flag
# arguments are missing per the gaps in the embedded numbering). Code kept
# byte-identical; comments only.
# Extracts videos from videos.arte.tv (fr/de), with separate paths for live
# streams and "Plus 7" catch-up videos, via repeated regex scrapes.
1086 class ArteTvIE(InfoExtractor):
1087 """arte.tv information extractor."""
1089 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1090 _LIVE_URL = r'index-[0-9]+\.html$'
1092 IE_NAME = u'arte.tv'
1094 def __init__(self, downloader=None):
1095 InfoExtractor.__init__(self, downloader)
1097 def report_download_webpage(self, video_id):
1098 """Report webpage download."""
1099 self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)
1101 def report_extraction(self, video_id):
1102 """Report information extraction."""
1103 self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)
# Fetches a URL and returns the raw response body (return line elided here);
# network and URL errors are routed to the downloader's trouble() reporter.
1105 def fetch_webpage(self, url):
1106 request = compat_urllib_request.Request(url)
1108 self.report_download_webpage(url)
1109 webpage = compat_urllib_request.urlopen(request).read()
1110 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1111 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1113 except ValueError as err:
1114 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
# Generic scrape helper: fetches `url`, applies `regex`, and maps the listed
# (group-index, key, error-message) tuples into an info dict.
1118 def grep_webpage(self, url, regex, regexFlags, matchTuples):
1119 page = self.fetch_webpage(url)
1120 mobj = re.search(regex, page, regexFlags)
1124 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1127 for (i, key, err) in matchTuples:
1128 if mobj.group(i) is None:
1129 self._downloader.trouble(err)
1132 info[key] = mobj.group(i)
# Live-stream path: locate the videothek JS, then the RTMP path/player/url
# triple. video_lang is presumably 'fr' or 'de' from the URL — TODO confirm.
1136 def extractLiveStream(self, url):
1137 video_lang = url.split('/')[-4]
1138 info = self.grep_webpage(
1140 r'src="(.*?/videothek_js.*?\.js)',
1143 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1146 http_host = url.split('/')[2]
1147 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1148 info = self.grep_webpage(
1150 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1151 '(http://.*?\.swf).*?' +
1155 (1, 'path', u'ERROR: could not extract video path: %s' % url),
1156 (2, 'player', u'ERROR: could not extract video player: %s' % url),
1157 (3, 'url', u'ERROR: could not extract video url: %s' % url)
1160 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
# Catch-up ("Plus 7") path: three chained scrapes — movie param -> <video>
# ref for the chosen language -> final id/name/date/hd-url record.
1162 def extractPlus7Stream(self, url):
1163 video_lang = url.split('/')[-3]
1164 info = self.grep_webpage(
1166 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1169 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1172 next_url = compat_urllib_parse.unquote(info.get('url'))
1173 info = self.grep_webpage(
1175 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1178 (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
1181 next_url = compat_urllib_parse.unquote(info.get('url'))
1183 info = self.grep_webpage(
1185 r'<video id="(.*?)".*?>.*?' +
1186 '<name>(.*?)</name>.*?' +
1187 '<dateVideo>(.*?)</dateVideo>.*?' +
1188 '<url quality="hd">(.*?)</url>',
1191 (1, 'id', u'ERROR: could not extract video id: %s' % url),
1192 (2, 'title', u'ERROR: could not extract video title: %s' % url),
1193 (3, 'date', u'ERROR: could not extract video date: %s' % url),
1194 (4, 'url', u'ERROR: could not extract video url: %s' % url)
# Result-dict fragment (enclosing return/brace lines elided in this listing).
1199 'id': info.get('id'),
1200 'url': compat_urllib_parse.unquote(info.get('url')),
1201 'uploader': u'arte.tv',
1202 'upload_date': info.get('date'),
1203 'title': info.get('title').decode('utf-8'),
1209 def _real_extract(self, url):
1210 video_id = url.split('/')[-1]
1211 self.report_extraction(video_id)
# Live URLs are recognised by their trailing 'index-NNN.html' segment.
1213 if re.search(self._LIVE_URL, video_id) is not None:
1214 self.extractLiveStream(url)
1217 info = self.extractPlus7Stream(url)
# NOTE(review): numbered listing with elided lines (try:/return/if-None
# guards and the final result-list close are missing per the embedded
# numbering gaps). Code kept byte-identical; comments only.
# Last-resort extractor: follows URL-shortener redirects via HEAD requests,
# then guesses a direct media URL out of the raw page HTML.
1222 class GenericIE(InfoExtractor):
1223 """Generic last-resort information extractor."""
1226 IE_NAME = u'generic'
1228 def __init__(self, downloader=None):
1229 InfoExtractor.__init__(self, downloader)
1231 def report_download_webpage(self, video_id):
1232 """Report webpage download."""
1233 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1234 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1236 def report_extraction(self, video_id):
1237 """Report information extraction."""
1238 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1240 def report_following_redirect(self, new_url):
1241 """Report information extraction."""
1242 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1244 def _test_redirect(self, url):
1245 """Check if it is a redirect, like url shorteners, in case restart chain."""
# Local Request subclass whose get_method() returns "HEAD" (return elided),
# so the probe downloads headers only, not the body.
1246 class HeadRequest(compat_urllib_request.Request):
1247 def get_method(self):
1250 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1252 Subclass the HTTPRedirectHandler to make it use our
1253 HeadRequest also on the redirected URL
1255 def redirect_request(self, req, fp, code, msg, headers, newurl):
1256 if code in (301, 302, 303, 307):
1257 newurl = newurl.replace(' ', '%20')
# Drop body-describing headers before re-issuing as HEAD.
1258 newheaders = dict((k,v) for k,v in req.headers.items()
1259 if k.lower() not in ("content-length", "content-type"))
1260 return HeadRequest(newurl,
1262 origin_req_host=req.get_origin_req_host(),
1265 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1267 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1269 Fallback to GET if HEAD is not allowed (405 HTTP error)
1271 def http_error_405(self, req, fp, code, msg, headers):
1275 newheaders = dict((k,v) for k,v in req.headers.items()
1276 if k.lower() not in ("content-length", "content-type"))
1277 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1279 origin_req_host=req.get_origin_req_host(),
# Build a bare opener with only the handlers needed for the HEAD probe.
1283 opener = compat_urllib_request.OpenerDirector()
1284 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1285 HTTPMethodFallback, HEADRedirectHandler,
1286 compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1287 opener.add_handler(handler())
1289 response = opener.open(HeadRequest(url))
1290 new_url = response.geturl()
# If the final URL differs, restart the whole extractor chain on it.
1295 self.report_following_redirect(new_url)
1296 self._downloader.download([new_url])
1299 def _real_extract(self, url):
1300 if self._test_redirect(url): return
1302 video_id = url.split('/')[-1]
1303 request = compat_urllib_request.Request(url)
1305 self.report_download_webpage(video_id)
1306 webpage = compat_urllib_request.urlopen(request).read()
1307 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1308 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1310 except ValueError as err:
1311 # since this is the last-resort InfoExtractor, if
1312 # this error is thrown, it'll be thrown here
1313 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1316 self.report_extraction(video_id)
1317 # Start with something easy: JW Player in SWFObject
1318 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1320 # Broaden the search a little bit
1321 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1323 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1326 # It's possible that one of the regexes
1327 # matched, but returned an empty group:
1328 if mobj.group(1) is None:
1329 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1332 video_url = compat_urllib_parse.unquote(mobj.group(1))
1333 video_id = os.path.basename(video_url)
1335 # here's a fun little line of code for you:
1336 video_extension = os.path.splitext(video_id)[1][1:]
1337 video_id = os.path.splitext(video_id)[0]
1339 # it's tempting to parse this further, but you would
1340 # have to take into account all the variations like
1341 # Video Title - Site Name
1342 # Site Name | Video Title
1343 # Video Title - Tagline | Site Name
1344 # and so on and so forth; it's just not practical
1345 mobj = re.search(r'<title>(.*)</title>', webpage)
1347 self._downloader.trouble(u'ERROR: unable to extract title')
1349 video_title = mobj.group(1)
1351 # video uploader is domain name
1352 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1354 self._downloader.trouble(u'ERROR: unable to extract title')
1356 video_uploader = mobj.group(1)
# Result-dict fragment (enclosing `return [{` and closing lines elided).
1361 'uploader': video_uploader,
1362 'upload_date': None,
1363 'title': video_title,
1364 'ext': video_extension,
# NOTE(review): numbered listing with elided lines (if-None guards, try:,
# return, and the `if prefix == '':`/`else: n = int(prefix)` branches are
# missing per the embedded numbering gaps). Code kept byte-identical.
# Handles 'ytsearch[N|all]:QUERY' pseudo-URLs via the GData JSON API and
# enqueues each hit as a normal watch-page download.
1368 class YoutubeSearchIE(InfoExtractor):
1369 """Information Extractor for YouTube search queries."""
1370 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1371 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1372 _max_youtube_results = 1000
1373 IE_NAME = u'youtube:search'
1375 def __init__(self, downloader=None):
1376 InfoExtractor.__init__(self, downloader)
1378 def report_download_page(self, query, pagenum):
1379 """Report attempt to download search page with given number."""
1380 query = query.decode(preferredencoding())
1381 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
# Parses the prefix (empty -> 1 result, 'all' -> cap, digits -> n) and
# delegates to _download_n_results.
1383 def _real_extract(self, query):
1384 mobj = re.match(self._VALID_URL, query)
1386 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1389 prefix, query = query.split(':')
1391 query = query.encode('utf-8')
1393 self._download_n_results(query, 1)
1395 elif prefix == 'all':
1396 self._download_n_results(query, self._max_youtube_results)
1402 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1404 elif n > self._max_youtube_results:
1405 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1406 n = self._max_youtube_results
1407 self._download_n_results(query, n)
1409 except ValueError: # parsing prefix as integer fails
1410 self._download_n_results(query, 1)
1413 def _download_n_results(self, query, n):
1414 """Downloads a specified number of results for a query"""
# Pages through the API 50 ids at a time; `limit` shrinks to the server's
# totalItems so the loop stops when results run out.
1420 while (50 * pagenum) < limit:
1421 self.report_download_page(query, pagenum+1)
1422 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1423 request = compat_urllib_request.Request(result_url)
1425 data = compat_urllib_request.urlopen(request).read()
1426 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1427 self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
1429 api_response = json.loads(data)['data']
1431 new_ids = list(video['id'] for video in api_response['items'])
1432 video_ids += new_ids
1434 limit = min(n, api_response['totalItems'])
1437 if len(video_ids) > n:
1438 video_ids = video_ids[:n]
1439 for id in video_ids:
1440 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
# NOTE(review): numbered listing with elided lines (if-None guards, try:,
# return, and the prefix-parsing branches are missing per the embedded
# numbering gaps). Code kept byte-identical; comments only.
# Handles 'gvsearch[N|all]:QUERY' by scraping Google Video's HTML result
# pages, 10 results per page, until n ids are collected.
1444 class GoogleSearchIE(InfoExtractor):
1445 """Information Extractor for Google Video search queries."""
1446 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1447 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1448 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1449 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1450 _max_google_results = 1000
1451 IE_NAME = u'video.google:search'
1453 def __init__(self, downloader=None):
1454 InfoExtractor.__init__(self, downloader)
1456 def report_download_page(self, query, pagenum):
1457 """Report attempt to download playlist page with given number."""
1458 query = query.decode(preferredencoding())
1459 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
# Same prefix-dispatch shape as YoutubeSearchIE._real_extract.
1461 def _real_extract(self, query):
1462 mobj = re.match(self._VALID_URL, query)
1464 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1467 prefix, query = query.split(':')
1469 query = query.encode('utf-8')
1471 self._download_n_results(query, 1)
1473 elif prefix == 'all':
1474 self._download_n_results(query, self._max_google_results)
1480 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1482 elif n > self._max_google_results:
1483 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1484 n = self._max_google_results
1485 self._download_n_results(query, n)
1487 except ValueError: # parsing prefix as integer fails
1488 self._download_n_results(query, 1)
1491 def _download_n_results(self, query, n):
1492 """Downloads a specified number of results for a query"""
1498 self.report_download_page(query, pagenum)
# Google paginates by absolute result offset, hence pagenum*10.
1499 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
1500 request = compat_urllib_request.Request(result_url)
1502 page = compat_urllib_request.urlopen(request).read()
1503 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1504 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1507 # Extract video identifiers
1508 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1509 video_id = mobj.group(1)
1510 if video_id not in video_ids:
1511 video_ids.append(video_id)
1512 if len(video_ids) == n:
1513 # Specified n videos reached
1514 for id in video_ids:
1515 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
# No "next page" link: flush whatever was collected and stop.
1518 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1519 for id in video_ids:
1520 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1523 pagenum = pagenum + 1
# NOTE(review): numbered listing with elided lines (if-None guards, try:,
# return, and the prefix-parsing branches are missing per the embedded
# numbering gaps). Code kept byte-identical; comments only.
# Handles 'yvsearch[N|all]:QUERY' by scraping Yahoo! Video result pages;
# structurally parallel to GoogleSearchIE but deduplicates via a set.
1526 class YahooSearchIE(InfoExtractor):
1527 """Information Extractor for Yahoo! Video search queries."""
1530 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1531 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1532 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1533 _MORE_PAGES_INDICATOR = r'\s*Next'
1534 _max_yahoo_results = 1000
1535 IE_NAME = u'video.yahoo:search'
1537 def __init__(self, downloader=None):
1538 InfoExtractor.__init__(self, downloader)
1540 def report_download_page(self, query, pagenum):
1541 """Report attempt to download playlist page with given number."""
1542 query = query.decode(preferredencoding())
1543 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
# Same prefix-dispatch shape as the other *SearchIE classes.
1545 def _real_extract(self, query):
1546 mobj = re.match(self._VALID_URL, query)
1548 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1551 prefix, query = query.split(':')
1553 query = query.encode('utf-8')
1555 self._download_n_results(query, 1)
1557 elif prefix == 'all':
1558 self._download_n_results(query, self._max_yahoo_results)
1564 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1566 elif n > self._max_yahoo_results:
1567 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1568 n = self._max_yahoo_results
1569 self._download_n_results(query, n)
1571 except ValueError: # parsing prefix as integer fails
1572 self._download_n_results(query, 1)
1575 def _download_n_results(self, query, n):
1576 """Downloads a specified number of results for a query"""
# already_seen guards against duplicate ids across result pages.
1579 already_seen = set()
1583 self.report_download_page(query, pagenum)
1584 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
1585 request = compat_urllib_request.Request(result_url)
1587 page = compat_urllib_request.urlopen(request).read()
1588 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1589 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1592 # Extract video identifiers
1593 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1594 video_id = mobj.group(1)
1595 if video_id not in already_seen:
1596 video_ids.append(video_id)
1597 already_seen.add(video_id)
1598 if len(video_ids) == n:
1599 # Specified n videos reached
1600 for id in video_ids:
1601 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
# No "Next" link: flush collected ids and stop paging.
1604 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1605 for id in video_ids:
1606 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1609 pagenum = pagenum + 1
# NOTE(review): numbered listing with elided lines (if-None guards, try:,
# return, loop initialisation, and `break`/`else:` lines are missing per the
# embedded numbering gaps). Code kept byte-identical; comments only.
# Expands a YouTube playlist/artist/course URL into individual watch-page
# downloads, honouring the downloader's playliststart/playlistend params.
1612 class YoutubePlaylistIE(InfoExtractor):
1613 """Information Extractor for YouTube playlists."""
1615 _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
1616 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1617 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s'
1618 _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
1619 IE_NAME = u'youtube:playlist'
1621 def __init__(self, downloader=None):
1622 InfoExtractor.__init__(self, downloader)
1624 def report_download_page(self, playlist_id, pagenum):
1625 """Report attempt to download playlist page with given number."""
1626 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1628 def _real_extract(self, url):
1629 # Extract playlist id
1630 mobj = re.match(self._VALID_URL, url)
1632 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# Group 3 captures a single-video suffix: hand it off directly instead of
# expanding the whole playlist.
1636 if mobj.group(3) is not None:
1637 self._downloader.download([mobj.group(3)])
1640 # Download playlist pages
1641 # prefix is 'p' as default for playlists but there are other types that need extra care
1642 playlist_prefix = mobj.group(1)
1643 if playlist_prefix == 'a':
1644 playlist_access = 'artist'
1646 playlist_prefix = 'p'
1647 playlist_access = 'view_play_list'
1648 playlist_id = mobj.group(2)
1653 self.report_download_page(playlist_id, pagenum)
1654 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1655 request = compat_urllib_request.Request(url)
1657 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1658 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1659 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1662 # Extract video identifiers
1664 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1665 if mobj.group(1) not in ids_in_page:
1666 ids_in_page.append(mobj.group(1))
1667 video_ids.extend(ids_in_page)
# Absence of the "Next »" marker means last page (break line elided here).
1669 if self._MORE_PAGES_INDICATOR not in page:
1671 pagenum = pagenum + 1
1673 total = len(video_ids)
# playliststart is 1-based in params, converted to a 0-based slice index;
# playlistend == -1 means "to the end".
1675 playliststart = self._downloader.params.get('playliststart', 1) - 1
1676 playlistend = self._downloader.params.get('playlistend', -1)
1677 if playlistend == -1:
1678 video_ids = video_ids[playliststart:]
1680 video_ids = video_ids[playliststart:playlistend]
1682 if len(video_ids) == total:
1683 self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
1685 self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))
1687 for id in video_ids:
1688 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
# NOTE(review): numbered listing with elided lines (if-None guard, try:,
# return, loop initialisation, and break are missing per the embedded
# numbering gaps). Code kept byte-identical; comments only.
# Expands a /channel/ URL by paging the channel's videos listing and
# queueing each found watch URL.
1692 class YoutubeChannelIE(InfoExtractor):
1693 """Information Extractor for YouTube channels."""
1695 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1696 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1697 _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
1698 IE_NAME = u'youtube:channel'
1700 def report_download_page(self, channel_id, pagenum):
1701 """Report attempt to download channel page with given number."""
1702 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1704 def _real_extract(self, url):
1705 # Extract channel id
1706 mobj = re.match(self._VALID_URL, url)
1708 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1711 # Download channel pages
1712 channel_id = mobj.group(1)
1717 self.report_download_page(channel_id, pagenum)
1718 url = self._TEMPLATE_URL % (channel_id, pagenum)
1719 request = compat_urllib_request.Request(url)
1721 page = compat_urllib_request.urlopen(request).read().decode('utf8')
1722 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1723 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1726 # Extract video identifiers
1728 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1729 if mobj.group(1) not in ids_in_page:
1730 ids_in_page.append(mobj.group(1))
1731 video_ids.extend(ids_in_page)
# Absence of the "Next »" marker means last page (break line elided here).
1733 if self._MORE_PAGES_INDICATOR not in page:
1735 pagenum = pagenum + 1
1737 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1739 for id in video_ids:
1740 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
# NOTE(review): numbered listing with elided lines (if-None guard, try:,
# return, loop initialisation, break, else: are missing per the embedded
# numbering gaps). Code kept byte-identical; comments only.
# Expands a YouTube user URL (or 'ytuser:NAME') by paging the GData uploads
# feed _GDATA_PAGE_SIZE ids at a time.
1744 class YoutubeUserIE(InfoExtractor):
1745 """Information Extractor for YouTube users."""
1747 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1748 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1749 _GDATA_PAGE_SIZE = 50
1750 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1751 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1752 IE_NAME = u'youtube:user'
1754 def __init__(self, downloader=None):
1755 InfoExtractor.__init__(self, downloader)
1757 def report_download_page(self, username, start_index):
1758 """Report attempt to download user page."""
1759 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1760 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1762 def _real_extract(self, url):
1764 mobj = re.match(self._VALID_URL, url)
1766 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1769 username = mobj.group(1)
1771 # Download video ids using YouTube Data API. Result size per
1772 # query is limited (currently to 50 videos) so we need to query
1773 # page by page until there are no video ids - it means we got
# GData start-index is 1-based, hence the +1.
1780 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1781 self.report_download_page(username, start_index)
1783 request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1786 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1787 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1788 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1791 # Extract video identifiers
1794 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1795 if mobj.group(1) not in ids_in_page:
1796 ids_in_page.append(mobj.group(1))
1798 video_ids.extend(ids_in_page)
1800 # A little optimization - if current page is not
1801 # "full", ie. does not contain PAGE_SIZE video ids then
1802 # we can assume that this page is the last one - there
1803 # are no more ids on further pages - no need to query
1806 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1811 all_ids_count = len(video_ids)
# Same playliststart/playlistend slicing convention as YoutubePlaylistIE.
1812 playliststart = self._downloader.params.get('playliststart', 1) - 1
1813 playlistend = self._downloader.params.get('playlistend', -1)
1815 if playlistend == -1:
1816 video_ids = video_ids[playliststart:]
1818 video_ids = video_ids[playliststart:playlistend]
1820 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1821 (username, all_ids_count, len(video_ids)))
1823 for video_id in video_ids:
1824 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
# NOTE(review): numbered listing with elided lines (if-None guard, try:,
# return, loop initialisation, break, else: are missing per the embedded
# numbering gaps). Code kept byte-identical; comments only.
# Expands a blip.tv user page (or 'bliptvuser:NAME') via the mobile AJAX
# episode-list endpoint, paging until a short (non-full) page is seen.
# NOTE(review): the early-exit test references self._PAGE_SIZE, which is not
# defined in this visible listing — presumably declared on a line elided
# here (or inherited); verify before editing.
1827 class BlipTVUserIE(InfoExtractor):
1828 """Information Extractor for blip.tv users."""
1830 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1832 IE_NAME = u'blip.tv:user'
1834 def __init__(self, downloader=None):
1835 InfoExtractor.__init__(self, downloader)
1837 def report_download_page(self, username, pagenum):
1838 """Report attempt to download user page."""
1839 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1840 (self.IE_NAME, username, pagenum))
1842 def _real_extract(self, url):
1844 mobj = re.match(self._VALID_URL, url)
1846 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1849 username = mobj.group(1)
1851 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1853 request = compat_urllib_request.Request(url)
# First fetch resolves the display name to the numeric users_id embedded in
# the page, which the AJAX endpoint requires.
1856 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1857 mobj = re.search(r'data-users-id="([^"]+)"', page)
1858 page_base = page_base % mobj.group(1)
1859 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1860 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1864 # Download video ids using BlipTV Ajax calls. Result size per
1865 # query is limited (currently to 12 videos) so we need to query
1866 # page by page until there are no video ids - it means we got
1873 self.report_download_page(username, pagenum)
1875 request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )
1878 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1879 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1880 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1883 # Extract video identifiers
1886 for mobj in re.finditer(r'href="/([^"]+)"', page):
1887 if mobj.group(1) not in ids_in_page:
1888 ids_in_page.append(unescapeHTML(mobj.group(1)))
1890 video_ids.extend(ids_in_page)
1892 # A little optimization - if current page is not
1893 # "full", ie. does not contain PAGE_SIZE video ids then
1894 # we can assume that this page is the last one - there
1895 # are no more ids on further pages - no need to query
1898 if len(ids_in_page) < self._PAGE_SIZE:
1903 all_ids_count = len(video_ids)
# Same playliststart/playlistend slicing convention as YoutubePlaylistIE.
1904 playliststart = self._downloader.params.get('playliststart', 1) - 1
1905 playlistend = self._downloader.params.get('playlistend', -1)
1907 if playlistend == -1:
1908 video_ids = video_ids[playliststart:]
1910 video_ids = video_ids[playliststart:playlistend]
1912 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1913 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1915 for video_id in video_ids:
1916 self._downloader.download([u'http://blip.tv/'+video_id])
# NOTE(review): numbered listing with elided lines (if-None guards, try:,
# return, and the final `return [{...}]` close are missing per the embedded
# numbering gaps). Code kept byte-identical; comments only.
# Extracts a direct file URL from depositfiles.com by simulating the
# "Free download" form post and scraping the resulting page.
1919 class DepositFilesIE(InfoExtractor):
1920 """Information extractor for depositfiles.com"""
1922 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1924 def report_download_webpage(self, file_id):
1925 """Report webpage download."""
1926 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1928 def report_extraction(self, file_id):
1929 """Report information extraction."""
1930 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1932 def _real_extract(self, url):
1933 file_id = url.split('/')[-1]
1934 # Rebuild url in english locale
1935 url = 'http://depositfiles.com/en/files/' + file_id
1937 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 is what the site's free-download button submits.
1938 free_download_indication = { 'gateway_result' : '1' }
1939 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
1941 self.report_download_webpage(file_id)
1942 webpage = compat_urllib_request.urlopen(request).read()
1943 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1944 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
1947 # Search for the real file URL
1948 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1949 if (mobj is None) or (mobj.group(1) is None):
1950 # Try to figure out reason of the error.
# The site renders its refusal reason inside a <strong>Attention...</strong>
# block; surface that text verbatim when present.
1951 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1952 if (mobj is not None) and (mobj.group(1) is not None):
1953 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1954 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1956 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1959 file_url = mobj.group(1)
1960 file_extension = os.path.splitext(file_url)[1][1:]
1962 # Search for file title
1963 mobj = re.search(r'<b title="(.*?)">', webpage)
1965 self._downloader.trouble(u'ERROR: unable to extract title')
1967 file_title = mobj.group(1).decode('utf-8')
# Result-dict fragment (enclosing `return [{` and closing lines elided).
1970 'id': file_id.decode('utf-8'),
1971 'url': file_url.decode('utf-8'),
1973 'upload_date': None,
1974 'title': file_title,
1975 'ext': file_extension.decode('utf-8'),
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook.

    Optionally logs in with credentials from --username/--password or
    ~/.netrc (machine 'facebook'), downloads the video page, and scrapes
    escaped-Unicode JavaScript assignments for title, owner, thumbnail
    and per-format stream URLs.

    NOTE(review): several source lines are missing from this chunk
    (dict bodies, some if/else branches, try openers and returns); the
    code below is kept byte-for-byte and annotated where gaps are
    apparent.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'  # machine name used for the .netrc credential lookup
    _available_formats = ['video', 'highqual', 'lowqual']  # ordered best quality first
    # NOTE(review): the entries of _video_extensions are not visible in this chunk.
    _video_extensions = {
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page"""
        # General fields: each value is a regex with one capture group.
        data = {'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
        # NOTE(review): the closing brace of `data` and the initialisation of
        # `video_info` are not visible in this chunk.
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                # Values are escaped-Unicode embedded in the (generally utf-8) page.
                video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Per-format stream URLs.
        # NOTE(review): the initialisation of `video_urls` is not visible here.
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls
        # NOTE(review): the `return video_info` presumably follows — not visible.

    def _real_initialize(self):
        """Best-effort login before extraction; warns (does not abort) on failure."""
        if self._downloader is None:
            # NOTE(review): the early-return body is not visible in this chunk.

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # NOTE(review): the `try:` opener for this netrc lookup is not visible.
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
                # NOTE(review): the useremail/password assignments are not visible;
                # the raise below belongs to the (missing) else branch.
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # .netrc problems are only a warning: login is optional.
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))

        if useremail is None:
            # NOTE(review): the return and the login_form construction are not visible.

        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        # NOTE(review): the `try:`/report_login lines are not visible here.
        login_results = compat_urllib_request.urlopen(request).read()
        # If the login form is still present, the login did not succeed.
        if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
            self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard and `return` are not visible.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        # NOTE(review): the `try:` opener is not visible here.
        page = compat_urllib_request.urlopen(request)
        video_webpage = page.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = video_info['owner']

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')

        # thumbnail image: missing thumbnail is only a warning, not fatal
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:
            video_thumbnail = video_info['thumbnail']

        # upload date: parsed from an RFC-2822 style date via email.utils
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                # NOTE(review): the `try:` opener around strftime is not visible.
                upload_date = time.strftime('%Y%m%d', timetuple[0:9])

        # description
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)
        format_limit = self._downloader.params.get('format_limit', None)

        if format_limit is not None and format_limit in self._available_formats:
            # Restrict the candidate list to formats at or below the limit.
            format_list = self._available_formats[self._available_formats.index(format_limit):]
        else:
            format_list = self._available_formats
        existing_formats = [x for x in format_list if x in url_map]
        if len(existing_formats) == 0:
            self._downloader.trouble(u'ERROR: no known formats available for video')
        if req_format is None:
            video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
        elif req_format == 'worst':
            video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
        elif req_format == '-1':
            video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
        else:
            if req_format not in url_map:
                self._downloader.trouble(u'ERROR: requested format not available')
            video_url_list = [(req_format, url_map[req_format])] # Specific format

        # NOTE(review): the `results = []` initialisation is not visible here.
        for format_param, video_real_url in video_url_list:
            # Extension from the format lookup table, defaulting to mp4.
            video_extension = self._video_extensions.get(format_param, 'mp4')

            # NOTE(review): the opening of this info dict is not visible.
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension.decode('utf-8'),
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description.decode('utf-8'),
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv

    Appends skin=json to the URL; if the server responds with a direct
    video Content-Type, the info dict is built from the URL itself,
    otherwise the returned JSON ('Post' wrapper) is parsed.

    NOTE(review): several source lines are missing from this chunk
    (guard clauses, try openers, the direct-download info dict); the
    code below is kept byte-for-byte and annotated where gaps are
    apparent.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'  # derives the file extension from the media URL
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard and `return` are not visible.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # NOTE(review): `cchar` ('?' or '&' depending on the URL) is computed on
        # lines missing from this chunk; `info = None` is likewise not visible.
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        self.report_extraction(mobj.group(1))
        # NOTE(review): the `try:` opener for this block is not visible.
        urlh = compat_urllib_request.urlopen(request)
        if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
            basename = url.split('/')[-1]
            title,ext = os.path.splitext(basename)
            title = title.decode('UTF-8')
            ext = ext.replace('.', '')
            self.report_direct_download(title)
            # NOTE(review): the remainder of the direct-download info dict is missing.
            'upload_date': None,
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))

        if info is None: # Regular URL
            # NOTE(review): the `try:` opener is not visible here.
            json_code_bytes = urlh.read()
            json_code = json_code_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))

            json_data = json.loads(json_code)
            if 'Post' in json_data:
                data = json_data['Post']
            # Datestamp format per the strptime pattern: e.g. '10-31-11 07:00PM'.
            upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
            video_url = data['media']['url']
            umobj = re.match(self._URL_EXT, video_url)
            # NOTE(review): the `if umobj is None:` guard is not visible here.
            raise ValueError('Can not determine filename extension')
            ext = umobj.group(1)

            # NOTE(review): the opening of this info dict is not visible.
                'id': data['item_id'],
                'uploader': data['display_name'],
                'upload_date': upload_date,
                'title': data['title'],
                'format': data['media']['mimeType'],
                'thumbnail': data['thumbnailUrl'],
                'description': data['description'],
                'player_url': data['embedUrl']
        except (ValueError,KeyError) as err:
            self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))

        # blip.tv serves media to this UA; spoof iTunes for the actual download.
        std_headers['User-Agent'] = 'iTunes/10.6.1'
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de.

    Derives the FLV URL from the thumbnail host path found in the
    page's image_src link tag.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Fixed typo: was self._download.trouble (AttributeError at runtime).
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The media host/path is taken from the thumbnail URL; the video itself
        # is <that path>/<video_id>.flv
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': u'flv',
        }]
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    # NOTE(review): the closing triple-quote of this VERBOSE pattern is not
    # visible in this chunk.
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                           (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                           |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
    # Bitrates, best last (the extractor picks turls[-1] below).
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # NOTE(review): the entries of these two mappings are not visible here.
    _video_extensions = {
    _video_dimensions = {

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written with re.VERBOSE.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        """List the available bitrates with their extension and dimensions."""
        print('Available formats:')
        # NOTE(review): the `for x in formats:` line is not visible in this chunk.
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # NOTE(review): the `if mobj is None:` guard and `return` are not visible.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Shortname forms (:tds, :colbert, ...) redirect to the newest full episode.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            # NOTE(review): the `else:` introducing this branch is not visible.
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            # NOTE(review): the `else:` introducing this branch is not visible.
                epTitle = mobj.group('cntitle')
        # NOTE(review): the `else:` and dlNewest handling around these lines are
        # only partially visible.
            dlNewest = not mobj.group('episode')
                epTitle = mobj.group('showname')
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        # NOTE(review): the `try:` opener is not visible here.
        htmlHandle = compat_urllib_request.urlopen(req)
        html = htmlHandle.read()
        webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
        # The fetch may have been redirected; re-match against the final URL.
        # NOTE(review): this appears to belong to a dlNewest branch missing here.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            # NOTE(review): guard lines around these trouble calls are not visible.
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
            # NOTE(review): the `else:` introducing this line is not visible.
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        # NOTE(review): the `try:` opener is not visible here.
        indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))

        # NOTE(review): the `results = []` initialisation is not visible here.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            # NOTE(review): the `try:` opener is not visible here.
            configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            # NOTE(review): the `turls = []` initialisation and the append into it
            # are not visible here.
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)

            # NOTE(review): guard (`if len(turls) == 0:`) is not visible.
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            # NOTE(review): the `for f,v in turls: if f == req_format:` lines are
            # not visible here.
                format, rtmp_video_url = f, v

            # Patch to download from alternative CDN, which does not
            # break on current RTMPdump builds
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            # NOTE(review): the `if m is None:` guard is not visible here.
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            # NOTE(review): the opening of this info dict is not visible.
                'upload_date': officialDate,
                'description': officialTitle,
            results.append(info)
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist

    Scrapes og:* meta tags for description/thumbnail/player, then fetches
    the player's config= JSON(ish) blob to obtain the stream URL.

    NOTE(review): guard clauses, try openers and the final info dict
    opening are missing from this chunk; code kept byte-for-byte.
    """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard and `return` are not visible.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        # NOTE(review): the `try:` opener is not visible here.
        webPage = compat_urllib_request.urlopen(url)
        webPageBytes = webPage.read()
        # Decode with the charset advertised in the Content-Type header,
        # falling back to utf-8.
        m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
        webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))

        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        # The player URL carries the config blob location in its query string.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        # NOTE(review): the `try:` opener is not visible here.
        configJSON = compat_urllib_request.urlopen(configUrl)
        m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
        configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        # NOTE(review): the `try:` opener is not visible here.
        config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        # NOTE(review): the opening of this info dict is not visible.
            'uploader': showName,
            'upload_date': None,
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com

    Downloads the moogaloop metadata XML, then the Adobe f4m manifest,
    and synthesises the final segment URL from both.

    NOTE(review): guard clauses, try openers and the info dict opening
    are missing from this chunk; code kept byte-for-byte.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard and `return` are not visible.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # NOTE(review): the opening of the partially-populated `info` dict is
        # not visible in this chunk.
            'upload_date': None,

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        # NOTE(review): the `try:` opener is not visible here.
        metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        # NOTE(review): the `try:` opener (guarding IndexError) is not visible.
        videoNode = mdoc.findall('./video')[0]
        info['description'] = videoNode.findall('./description')[0].text
        info['title'] = videoNode.findall('./caption')[0].text
        info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
        manifest_url = videoNode.findall('./file')[0].text
        # NOTE(review): the `except IndexError:` line is not visible here.
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        # NOTE(review): the `try:` opener is not visible here.
        manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        # NOTE(review): the `try:` opener is not visible here.
        media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
        node_id = media_node.attrib['url']
        video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')

        # Rebuild the segment URL from the manifest host plus ids.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com

    Scrapes the flash variables of the watch page for the FLV URL,
    title and thumbnail.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Extract video URL (percent-encoded inside the flv_url flash variable)
        mobj = re.search(r'flv_url=(.+?)&', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1))

        # Extract title (the page title minus the trailing " - XVIDEOS..." suffix)
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(0)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }

        return [info]
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # Resolve the canonical track info via the public API.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # Fetch the per-track stream descriptors.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json_bytes = compat_urllib_request.urlopen(request).read()
            stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        return [{
            'id': info['id'],
            'url': mediaURL,
            'uploader': info['user']['username'],
            'upload_date': info['created_at'],
            'title': info['title'],
            'ext': u'mp3',
            'description': info['description'],
        }]
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com

    Decodes the base64 'jsclassref' page variable into the RTMP path
    and scrapes title/description from the page.
    """
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: jsclassref holds a base64-encoded,
        # percent-encoded RTMP path.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com

    Queries the cloudcast JSON API and probes the listed stream URLs
    until a live one is found.

    NOTE(review): marked not working (_WORKING = False); several lines
    are missing from this chunk (try openers, break/return statements);
    code kept byte-for-byte. The .decode('utf-8') calls on str objects
    below would raise AttributeError on Python 3 — presumably Python 2
    leftovers; verify before re-enabling this IE.
    """

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        # NOTE(review): the `try:` opener for this block is not visible.
        bitrate_list = jsonData[fmt]
        if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
            bitrate = max(bitrate_list) # select highest

        url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        # NOTE(review): `return url_list` is presumably below — not visible.

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            # NOTE(review): the `try:` opener and the `return url` on success
            # are not visible in this chunk.
            compat_urllib_request.urlopen(url)
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # Dead URL: fall through to the next candidate.

    def _print_formats(self, formats):
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard and `return` are not visible.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        # NOTE(review): the `try:` opener is not visible here.
        self.report_download_json(file_url)
        jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            # NOTE(review): the `return` after listing is not visible here.

        if req_format is None or req_format == 'best':
            # Probe every format and take the first with a live URL.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    # NOTE(review): the `break` is not visible here.
        # NOTE(review): the `else:` introducing the specific-format branch is
        # not visible here.
            if req_format not in formats:
                self._downloader.trouble(u'ERROR: format is not available')
                # NOTE(review): the `return` is not visible here.

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        # NOTE(review): the `return [{` opening this info dict is not visible.
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
# Extractor for Stanford Open ClassRoom.  The URL regexp distinguishes three
# cases handled in _real_extract: a specific video, a course page (recurses
# into its VideoPage links), and the site root (recurses into CoursePage
# links).  NOTE(review): this is an elided numbered listing — guards, `try:`
# lines and `return` statements between the shown lines are not visible.
2968 class StanfordOpenClassroomIE(InfoExtractor):
2969 """Information extractor for Stanford's Open ClassRoom"""
2971 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2972 IE_NAME = u'stanfordoc'
2974 def report_download_webpage(self, objid):
2975 """Report information extraction."""
2976 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2978 def report_extraction(self, video_id):
2979 """Report information extraction."""
2980 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2982 def _real_extract(self, url):
2983 mobj = re.match(self._VALID_URL, url)
2985 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Case 1: URL names both a course and a video -> fetch the video's XML
# metadata and build a single info dict.
2988 if mobj.group('course') and mobj.group('video'): # A specific video
2989 course = mobj.group('course')
2990 video = mobj.group('video')
2992 'id': course + '_' + video,
2994 'upload_date': None,
2997 self.report_extraction(info['id'])
2998 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2999 xmlUrl = baseUrl + video + '.xml'
3001 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
3002 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3003 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
3005 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3007 info['title'] = mdoc.findall('./title')[0].text
3008 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3010 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3012 info['ext'] = info['url'].rpartition('.')[2]
# Case 2: course page only -> scrape title/description, collect VideoPage
# links, and recurse via self.extract on each.
3014 elif mobj.group('course'): # A course page
3015 course = mobj.group('course')
3020 'upload_date': None,
3023 self.report_download_webpage(info['id'])
3025 coursepage = compat_urllib_request.urlopen(url).read()
3026 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3027 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
3030 m = re.search('<h1>([^<]+)</h1>', coursepage)
3032 info['title'] = unescapeHTML(m.group(1))
3034 info['title'] = info['id']
3036 m = re.search('<description>([^<]+)</description>', coursepage)
3038 info['description'] = unescapeHTML(m.group(1))
# orderedSet keeps the page order of links while removing duplicates.
3040 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3043 'type': 'reference',
3044 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3048 for entry in info['list']:
3049 assert entry['type'] == 'reference'
3050 results += self.extract(entry['url'])
# Case 3: site root -> collect CoursePage links and recurse into each.
3055 'id': 'Stanford OpenClassroom',
3058 'upload_date': None,
3061 self.report_download_webpage(info['id'])
3062 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3064 rootpage = compat_urllib_request.urlopen(rootURL).read()
3065 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3066 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
3069 info['title'] = info['id']
3071 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3074 'type': 'reference',
3075 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3080 for entry in info['list']:
3081 assert entry['type'] == 'reference'
3082 results += self.extract(entry['url'])
# Extractor for mtv.com video pages.  Scrapes song/performer/uri/content-id
# meta tags from the page, then downloads a mediaGen XML playlist and picks
# the last <rendition> (highest quality).  NOTE(review): elided numbered
# listing — `if mobj is None:` guards, `try:` lines and `return` statements
# between the shown lines are not visible.
3085 class MTVIE(InfoExtractor):
3086 """Information extractor for MTV.com"""
3088 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3091 def report_extraction(self, video_id):
3092 """Report information extraction."""
3093 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3095 def _real_extract(self, url):
3096 mobj = re.match(self._VALID_URL, url)
3098 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Scheme-less URLs are accepted by _VALID_URL; normalize to http://.
3100 if not mobj.group('proto'):
3101 url = 'http://' + url
3102 video_id = mobj.group('videoid')
3104 webpage = self._download_webpage(url, video_id)
# Pull the metadata the site embeds as <meta name="mtv_*"> tags.
3106 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3108 self._downloader.trouble(u'ERROR: unable to extract song name')
3110 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3111 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3113 self._downloader.trouble(u'ERROR: unable to extract performer')
3115 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3116 video_title = performer + ' - ' + song_name
3118 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3120 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3122 mtvn_uri = mobj.group(1)
3124 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3126 self._downloader.trouble(u'ERROR: unable to extract content id')
3128 content_id = mobj.group(1)
# mediaGen returns an XML playlist of <rendition> elements for this video.
3130 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3131 self.report_extraction(video_id)
3132 request = compat_urllib_request.Request(videogen_url)
3134 metadataXml = compat_urllib_request.urlopen(request).read()
3135 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3136 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
3139 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3140 renditions = mdoc.findall('.//rendition')
3142 # For now, always pick the highest quality.
3143 rendition = renditions[-1]
# Format string is derived from the rendition's MIME subtype + geometry.
3146 _,_,ext = rendition.attrib['type'].partition('/')
3147 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3148 video_url = rendition.find('./src').text
3150 self._downloader.trouble('Invalid rendition field.')
3156 'uploader': performer,
3157 'upload_date': None,
3158 'title': video_title,
# Extractor for v.youku.com.  Downloads the getPlayList JSON, de-obfuscates
# the segment file id with a seeded PRNG shuffle, and emits one info dict per
# video segment.  NOTE(review): elided numbered listing — `def _gen_sid`,
# guards, `try:` lines and `return` statements are partially missing.
3166 class YoukuIE(InfoExtractor):
3167 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3169 def report_download_webpage(self, file_id):
3170 """Report webpage download."""
3171 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))
3173 def report_extraction(self, file_id):
3174 """Report information extraction."""
3175 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
# Session id: millisecond timestamp followed by two random numbers
# (this body belongs to _gen_sid, whose `def` line is elided here).
3178 nowTime = int(time.time() * 1000)
3179 random1 = random.randint(1000,1998)
3180 random2 = random.randint(1000,9999)
3182 return "%d%d%d" %(nowTime,random1,random2)
# Deterministic shuffle of the character alphabet driven by `seed` — a
# linear-congruential step picks and removes one source char per round.
3184 def _get_file_ID_mix_string(self, seed):
3186 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3188 for i in range(len(source)):
3189 seed = (seed * 211 + 30031 ) % 65536
3190 index = math.floor(seed / 65536 * len(source) )
3191 mixed.append(source[int(index)])
3192 source.remove(source[int(index)])
3193 #return ''.join(mixed)
# Map the '*'-separated numeric fileId through the shuffled alphabet.
3196 def _get_file_id(self, fileId, seed):
3197 mixed = self._get_file_ID_mix_string(seed)
3198 ids = fileId.split('*')
3202 realId.append(mixed[int(ch)])
3203 return ''.join(realId)
3205 def _real_extract(self, url):
3206 mobj = re.match(self._VALID_URL, url)
3208 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3210 video_id = mobj.group('ID')
3212 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3214 request = compat_urllib_request.Request(info_url, None, std_headers)
3216 self.report_download_webpage(video_id)
3217 jsondata = compat_urllib_request.urlopen(request).read()
3218 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3219 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3222 self.report_extraction(video_id)
3224 jsonstr = jsondata.decode('utf-8')
3225 config = json.loads(jsonstr)
3227 video_title = config['data'][0]['title']
3228 seed = config['data'][0]['seed']
# Choose a stream format: 'best' prefers hd2 when available; 'worst'
# takes the other end of the quality range (branch bodies elided).
3230 format = self._downloader.params.get('format', None)
3231 supported_format = list(config['data'][0]['streamfileids'].keys())
3233 if format is None or format == 'best':
3234 if 'hd2' in supported_format:
3239 elif format == 'worst':
3247 fileid = config['data'][0]['streamfileids'][format]
3248 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3249 except (UnicodeDecodeError, ValueError, KeyError):
3250 self._downloader.trouble(u'ERROR: unable to extract info section')
3254 sid = self._gen_sid()
3255 fileid = self._get_file_id(fileid, seed)
3257 #column 8,9 of fileid represent the segment number
3258 #fileid[7:9] should be changed
3259 for index, key in enumerate(keys):
3261 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3262 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
# One info dict per segment, id suffixed with the part number.
3265 'id': '%s_part%02d' % (video_id, index),
3266 'url': download_url,
3268 'upload_date': None,
3269 'title': video_title,
3272 files_info.append(info)
# Extractor for video.xnxx.com.  Straight page scrape: flv URL, title and
# thumbnail are each pulled with a dedicated class-level regexp.
# NOTE(review): elided numbered listing — guards/`try:`/`return` lines
# between the shown lines are not visible.
3277 class XNXXIE(InfoExtractor):
3278 """Information extractor for xnxx.com"""
3280 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
3282 VIDEO_URL_RE = r'flv_url=(.*?)&'
3283 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3284 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3286 def report_webpage(self, video_id):
3287 """Report information extraction"""
3288 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3290 def report_extraction(self, video_id):
3291 """Report information extraction"""
3292 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3294 def _real_extract(self, url):
3295 mobj = re.match(self._VALID_URL, url)
3297 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3299 video_id = mobj.group(1)
3301 self.report_webpage(video_id)
3303 # Get webpage content
3305 webpage_bytes = compat_urllib_request.urlopen(url).read()
3306 webpage = webpage_bytes.decode('utf-8')
3307 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3308 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
# The flv URL is percent-encoded inside the page, hence unquote().
3311 result = re.search(self.VIDEO_URL_RE, webpage)
3313 self._downloader.trouble(u'ERROR: unable to extract video url')
3315 video_url = compat_urllib_parse.unquote(result.group(1))
3317 result = re.search(self.VIDEO_TITLE_RE, webpage)
3319 self._downloader.trouble(u'ERROR: unable to extract video title')
3321 video_title = result.group(1)
3323 result = re.search(self.VIDEO_THUMB_RE, webpage)
3325 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3327 video_thumbnail = result.group(1)
3333 'upload_date': None,
3334 'title': video_title,
3336 'thumbnail': video_thumbnail,
3337 'description': None,
# Extractor for Google+ posts containing a video.  Two-step scrape: the post
# page yields date/uploader/title and the photo-viewer URL; the viewer page
# yields the googlevideo redirector links, of which the highest resolution is
# chosen.  NOTE(review): elided numbered listing — guards/`try:`/`return`
# lines between the shown lines are not visible.
3341 class GooglePlusIE(InfoExtractor):
3342 """Information extractor for plus.google.com."""
3344 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3345 IE_NAME = u'plus.google'
3347 def __init__(self, downloader=None):
3348 InfoExtractor.__init__(self, downloader)
3350 def report_extract_entry(self, url):
3351 """Report downloading extry"""
3352 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3354 def report_date(self, upload_date):
3355 """Report downloading extry"""
3356 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3358 def report_uploader(self, uploader):
3359 """Report downloading extry"""
3360 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3362 def report_title(self, video_title):
3363 """Report downloading extry"""
3364 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3366 def report_extract_vid_page(self, video_page):
3367 """Report information extraction."""
3368 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3370 def _real_extract(self, url):
3371 # Extract id from URL
3372 mobj = re.match(self._VALID_URL, url)
3374 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3377 post_url = mobj.group(0)
3378 video_id = mobj.group(1)
3380 video_extension = 'flv'
3382 # Step 1, Retrieve post webpage to extract further information
3383 self.report_extract_entry(post_url)
3384 request = compat_urllib_request.Request(post_url)
3386 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3387 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3388 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3391 # Extract update date
3393 pattern = 'title="Timestamp">(.*?)</a>'
3394 mobj = re.search(pattern, webpage)
3396 upload_date = mobj.group(1)
3397 # Convert timestring to a format suitable for filename
3398 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3399 upload_date = upload_date.strftime('%Y%m%d')
3400 self.report_date(upload_date)
# Uploader name comes from the rel="author" anchor on the post page.
3404 pattern = r'rel\="author".*?>(.*?)</a>'
3405 mobj = re.search(pattern, webpage)
3407 uploader = mobj.group(1)
3408 self.report_uploader(uploader)
3411 # Get the first line for title
3413 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3414 mobj = re.search(pattern, webpage)
3416 video_title = mobj.group(1)
3417 self.report_title(video_title)
3419 # Step 2, Stimulate clicking the image box to launch video
3420 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3421 mobj = re.search(pattern, webpage)
3423 self._downloader.trouble(u'ERROR: unable to extract video page URL')
3425 video_page = mobj.group(1)
3426 request = compat_urllib_request.Request(video_page)
3428 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3429 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3430 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3432 self.report_extract_vid_page(video_page)
3435 # Extract video links on video page
3436 """Extract video links of all sizes"""
3437 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3438 mobj = re.findall(pattern, webpage)
3440 self._downloader.trouble(u'ERROR: unable to extract video links')
3442 # Sort in resolution
3443 links = sorted(mobj)
3445 # Choose the lowest of the sort, i.e. highest resolution
3446 video_url = links[-1]
3447 # Only get the url. The resolution part in the tuple has no use anymore
3448 video_url = video_url[-1]
3449 # Treat escaped \u0026 style hex
3451 video_url = video_url.decode("unicode_escape")
3452 except AttributeError: # Python 3
3453 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3459 'uploader': uploader,
3460 'upload_date': upload_date,
3461 'title': video_title,
3462 'ext': video_extension,
# Extractor for nba.com video pages.  The media URL is built directly from
# the URL path against the Turner CDN; page metadata is scraped via the local
# _findProp helper.  NOTE(review): elided numbered listing — guards and
# `return` lines between the shown lines are not visible.
3465 class NBAIE(InfoExtractor):
3466 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3469 def _real_extract(self, url):
3470 mobj = re.match(self._VALID_URL, url)
3472 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3475 video_id = mobj.group(1)
3476 if video_id.endswith('/index.html'):
3477 video_id = video_id[:-len('/index.html')]
3479 webpage = self._download_webpage(url, video_id)
3481 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
# Closure over `webpage`: first regexp group, HTML-unescaped, or default.
3482 def _findProp(rexp, default=None):
3483 m = re.search(rexp, webpage)
3485 return unescapeHTML(m.group(1))
3489 shortened_video_id = video_id.rpartition('/')[2]
3490 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3492 'id': shortened_video_id,
3496 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3497 'description': _findProp(r'<div class="description">(.*?)</h1>'),
# Extractor for justin.tv / twitch.tv.  Uses the justin.tv JSON API; for a
# channel URL it pages through the archive list _JUSTIN_PAGE_LIMIT entries at
# a time, for a /b/ URL it fetches a single broadcast.  NOTE(review): elided
# numbered listing — guards, loop headers and `return` lines between the
# shown lines are not visible.
3501 class JustinTVIE(InfoExtractor):
3502 """Information extractor for justin.tv and twitch.tv"""
3503 # TODO: One broadcast may be split into multiple videos. The key
3504 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3505 # starts at 1 and increases. Can we treat all parts as one video?
3507 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3508 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3509 _JUSTIN_PAGE_LIMIT = 100
3510 IE_NAME = u'justin.tv'
3512 def report_extraction(self, file_id):
3513 """Report information extraction."""
3514 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3516 def report_download_page(self, channel, offset):
3517 """Report attempt to download a single page of videos."""
3518 self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
3519 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3521 # Return count of items, list of *valid* items
3522 def _parse_page(self, url):
3524 urlh = compat_urllib_request.urlopen(url)
3525 webpage_bytes = urlh.read()
3526 webpage = webpage_bytes.decode('utf-8', 'ignore')
3527 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3528 self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
# The API returns a JSON list on success; anything else is an error
# object with an 'error' field.
3531 response = json.loads(webpage)
3532 if type(response) != list:
3533 error_text = response.get('error', 'unknown error')
3534 self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
3537 for clip in response:
3538 video_url = clip['video_file_url']
3540 video_extension = os.path.splitext(video_url)[1][1:]
# start_time is ISO-ish 'YYYY-MM-DD...'; strip dashes -> YYYYMMDD.
3541 video_date = re.sub('-', '', clip['start_time'][:10])
3542 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3546 'title': clip['title'],
3547 'uploader': clip.get('channel_name', video_uploader_id),
3548 'uploader_id': video_uploader_id,
3549 'upload_date': video_date,
3550 'ext': video_extension,
3552 return (len(response), info)
3554 def _real_extract(self, url):
3555 mobj = re.match(self._VALID_URL, url)
3557 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Group 1 = channel name, group 2 (if present) = broadcast id; lastindex
# tells us which form matched and selects the API endpoint.
3560 api = 'http://api.justin.tv'
3561 video_id = mobj.group(mobj.lastindex)
3563 if mobj.lastindex == 1:
3565 api += '/channel/archives/%s.json'
3567 api += '/broadcast/by_archive/%s.json'
3568 api = api % (video_id,)
3570 self.report_extraction(video_id)
# Page through results; a short page (count != limit) ends the loop.
3574 limit = self._JUSTIN_PAGE_LIMIT
3577 self.report_download_page(video_id, offset)
3578 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3579 page_count, page_info = self._parse_page(page_url)
3580 info.extend(page_info)
3581 if not paged or page_count != limit:
# Extractor for funnyordie.com.  Scrapes the <video>/<source> tag for the
# media URL, the player h1 anchor for the title, and the og:description meta
# tag for the description.  NOTE(review): elided numbered listing — guards
# and `return` lines between the shown lines are not visible.
3586 class FunnyOrDieIE(InfoExtractor):
3587 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3589 def _real_extract(self, url):
3590 mobj = re.match(self._VALID_URL, url)
3592 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3595 video_id = mobj.group('id')
3596 webpage = self._download_webpage(url, video_id)
# re.DOTALL so the pattern can span the whitespace/newlines between tags.
3598 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3600 self._downloader.trouble(u'ERROR: unable to find video information')
3601 video_url = unescapeHTML(m.group('url'))
3603 m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
3605 self._downloader.trouble(u'Cannot find video title')
3606 title = unescapeHTML(m.group('title'))
3608 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3610 desc = unescapeHTML(m.group('desc'))
3619 'description': desc,
# Extractor for tweetreel.com.  Scrapes the status id, tweet text, uploader
# and unix timestamp from the page, then builds the .mov URL directly from
# the status id.  NOTE(review): elided numbered listing — guards and
# `return` lines between the shown lines are not visible.
3623 class TweetReelIE(InfoExtractor):
3624 _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'
3626 def _real_extract(self, url):
3627 mobj = re.match(self._VALID_URL, url)
3629 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3632 video_id = mobj.group('id')
3633 webpage = self._download_webpage(url, video_id)
3635 m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
3637 self._downloader.trouble(u'ERROR: Cannot find status ID')
3638 status_id = m.group(1)
# Description: tweet text with embedded <a> anchors stripped out.
3640 m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
3642 self._downloader.trouble(u'WARNING: Cannot find description')
3643 desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()
3645 m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
3647 self._downloader.trouble(u'ERROR: Cannot find uploader')
3648 uploader = unescapeHTML(m.group('uploader'))
3649 uploader_id = unescapeHTML(m.group('uploader_id'))
# The page carries a unix timestamp; convert to YYYYMMDD for upload_date.
3651 m = re.search(r'<span unixtime="([0-9]+)"', webpage)
3653 self._downloader.trouble(u'ERROR: Cannot find upload date')
3654 upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')
3657 video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'
3664 'description': desc,
3665 'uploader': uploader,
3666 'uploader_id': uploader_id,
3667 'internal_id': status_id,
3668 'upload_date': upload_date
# Extractor for store.steampowered.com game trailer pages.  A verbose regexp
# matches the URL (so suitable() is overridden to pass re.VERBOSE); movie
# entries and their titles are zipped together from two finditer scans.
# NOTE(review): elided numbered listing — the gameID part of _VALID_URL and
# some lines of _real_extract are not visible.
3672 class SteamIE(InfoExtractor):
3673 _VALID_URL = r"""http://store.steampowered.com/
3674 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3676 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
# Override needed because the base class compiles _VALID_URL without
# re.VERBOSE, which this pattern requires.
3679 def suitable(self, url):
3680 """Receives a URL and returns True if suitable for this IE."""
3681 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
3683 def _real_extract(self, url):
3684 m = re.match(self._VALID_URL, url, re.VERBOSE)
3685 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3686 gameID = m.group('gameID')
3687 videourl = 'http://store.steampowered.com/video/%s/' % gameID
3688 webpage = self._download_webpage(videourl, gameID)
3689 mweb = re.finditer(urlRE, webpage)
3690 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3691 titles = re.finditer(namesRE, webpage)
# Pair each movie entry with the corresponding <span class="title">.
3693 for vid,vtitle in zip(mweb,titles):
3694 video_id = vid.group('videoID')
3695 title = vtitle.group('videoName')
3696 video_url = vid.group('videoURL')
3698 self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
3703 'title': unescapeHTML(title)
# Extractor for ustream.tv recorded videos.  The media URL is derived
# directly from the video id against the tcdn host; title and uploader are
# scraped from data-* attributes on the page.  NOTE(review): elided numbered
# listing — the final info-dict lines are partially missing.
3708 class UstreamIE(InfoExtractor):
3709 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3710 IE_NAME = u'ustream'
3712 def _real_extract(self, url):
3713 m = re.match(self._VALID_URL, url)
3714 video_id = m.group('videoID')
3715 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3716 webpage = self._download_webpage(url, video_id)
3717 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3718 title = m.group('title')
3719 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3720 uploader = m.group('uploader')
3726 'uploader': uploader
# Extractor for youporn.com.  Scrapes title/date/uploader, then parses the
# page's download list into one format dict per link, supporting
# --list-formats, best/worst/all and a specific requested format.
# NOTE(review): elided numbered listing — guards, `try:`/loop headers and
# `return` lines between the shown lines are not visible.
3732 class YouPornIE(InfoExtractor):
3733 """Information extractor for youporn.com."""
3734 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3736 def _print_formats(self, formats):
3737 """Print all available formats"""
3738 print(u'Available formats:')
3739 print(u'ext\t\tformat')
3740 print(u'---------------------------------')
3741 for format in formats:
3742 print(u'%s\t\t%s' % (format['ext'], format['format']))
# Select the single entry whose 'format' field equals req_format
# (loop header and return elided in this listing).
3744 def _specific(self, req_format, formats):
3746 if(x["format"]==req_format):
3750 def _real_extract(self, url):
3751 mobj = re.match(self._VALID_URL, url)
3753 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3756 video_id = mobj.group('videoid')
# The site gates content behind an age check; this cookie bypasses it.
3758 req = compat_urllib_request.Request(url)
3759 req.add_header('Cookie', 'age_verified=1')
3760 webpage = self._download_webpage(req, video_id)
3762 # Get the video title
3763 result = re.search(r'videoTitleArea">(?P<title>.*)</h1>', webpage)
3765 raise ExtractorError(u'ERROR: unable to extract video title')
3766 video_title = result.group('title').strip()
3768 # Get the video date
3769 result = re.search(r'Date:</b>(?P<date>.*)</li>', webpage)
3771 self._downloader.to_stderr(u'WARNING: unable to extract video date')
3774 upload_date = result.group('date').strip()
3776 # Get the video uploader
3777 result = re.search(r'Submitted:</b>(?P<uploader>.*)</li>', webpage)
3779 self._downloader.to_stderr(u'ERROR: unable to extract uploader')
3780 video_uploader = None
3782 video_uploader = result.group('uploader').strip()
3783 video_uploader = clean_html( video_uploader )
3785 # Get all of the formats available
3786 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3787 result = re.search(DOWNLOAD_LIST_RE, webpage)
3789 raise ExtractorError(u'Unable to extract download list')
3790 download_list_html = result.group('download_list').strip()
3792 # Get all of the links from the page
3793 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3794 links = re.findall(LINK_RE, download_list_html)
3795 if(len(links) == 0):
3796 raise ExtractorError(u'ERROR: no known formats available for video')
3798 self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))
3803 # A link looks like this:
3804 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3805 # A path looks like this:
3806 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
# Derive extension and "size_bitrate" format label from the URL path.
3807 video_url = unescapeHTML( link )
3808 path = compat_urllib_parse_urlparse( video_url ).path
3809 extension = os.path.splitext( path )[1][1:]
3810 format = path.split('/')[4].split('_')[:2]
3813 format = "-".join( format )
3814 title = u'%s-%s-%s' % (video_title, size, bitrate)
3819 'uploader': video_uploader,
3820 'upload_date': upload_date,
3825 'description': None,
3829 if self._downloader.params.get('listformats', None):
3830 self._print_formats(formats)
# Format selection: best = first entry, worst = last, all/-1 = everything,
# otherwise look up the exact requested format via _specific.
3833 req_format = self._downloader.params.get('format', None)
3834 self._downloader.to_screen(u'[youporn] Format: %s' % req_format)
3836 if req_format is None or req_format == 'best':
3838 elif req_format == 'worst':
3839 return [formats[-1]]
3840 elif req_format in ('-1', 'all'):
3843 format = self._specific( req_format, formats )
3845 self._downloader.trouble(u'ERROR: requested format not available')
# Extractor for pornotube.com.  The video id and title come from the URL
# itself; the flv URL and upload date are scraped from the page.
# NOTE(review): elided numbered listing — guards and `return` lines between
# the shown lines are not visible.
3851 class PornotubeIE(InfoExtractor):
3852 """Information extractor for pornotube.com."""
3853 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3855 def _real_extract(self, url):
3856 mobj = re.match(self._VALID_URL, url)
3858 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3861 video_id = mobj.group('videoid')
3862 video_title = mobj.group('title')
3864 # Get webpage content
3865 webpage = self._download_webpage(url, video_id)
3868 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3869 result = re.search(VIDEO_URL_RE, webpage)
3871 self._downloader.trouble(u'ERROR: unable to extract video url')
3873 video_url = compat_urllib_parse.unquote(result.group('url'))
3875 #Get the uploaded date
3876 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3877 result = re.search(VIDEO_UPLOADED_RE, webpage)
# NOTE(review): the error text says "title" but this branch concerns the
# upload date — looks like a copy/paste slip in the original message.
3879 self._downloader.trouble(u'ERROR: unable to extract video title')
3881 upload_date = result.group('date')
3883 info = {'id': video_id,
3886 'upload_date': upload_date,
3887 'title': video_title,
# Extractor for youjizz.com.  Two-step scrape: the watch page yields the
# title and the embed-page URL; the embed page yields the actual flv source
# via an addVariable("file", ...) call.  NOTE(review): elided numbered
# listing — guards and `return` lines between the shown lines are not
# visible.
3895 class YouJizzIE(InfoExtractor):
3896 """Information extractor for youjizz.com."""
3897 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3899 def __init__(self, downloader=None):
3900 InfoExtractor.__init__(self, downloader)
3902 def _real_extract(self, url):
3903 mobj = re.match(self._VALID_URL, url)
3905 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3908 video_id = mobj.group('videoid')
3910 # Get webpage content
3911 webpage = self._download_webpage(url, video_id)
3913 # Get the video title
3914 VIDEO_TITLE_RE = r'<title>(?P<title>.*)</title>'
3915 result = re.search(VIDEO_TITLE_RE, webpage)
3917 self._downloader.trouble(u'ERROR: unable to extract video title')
3919 video_title = result.group('title').strip()
3921 # Get the embed page
3922 EMBED_PAGE_RE = r'http://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)'
3923 result = re.search(EMBED_PAGE_RE, webpage)
3925 self._downloader.trouble(u'ERROR: unable to extract embed page')
# The embed page uses a numeric id that may differ from the watch-page
# slug; video_id is re-bound to the numeric one here.
3928 embed_page_url = result.group(0).strip()
3929 video_id = result.group('videoid')
3931 webpage = self._download_webpage(embed_page_url, video_id)
3934 SOURCE_RE = r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);'
3935 result = re.search(SOURCE_RE, webpage)
3937 self._downloader.trouble(u'ERROR: unable to extract video url')
3939 video_url = result.group('source')
3941 info = {'id': video_id,
3944 'upload_date': None,
3945 'title': video_title,
3949 'description': None,
3950 'player_url': embed_page_url}
3955 def gen_extractors():
3956 """ Return a list of an instance of every supported extractor.
3957 The order does matter; the first extractor matched is the one handling the URL.
3960 YoutubePlaylistIE(),
3984 StanfordOpenClassroomIE(),