2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
22 class InfoExtractor(object):
23 """Information Extractor class.
25 Information extractors are the classes that, given a URL, extract
26 information about the video (or videos) the URL refers to. This
27 information includes the real video URL, the video title, author and
28 others. The information is stored in a dictionary which is then
29 passed to the FileDownloader. The FileDownloader processes this
30 information possibly downloading the video to the file system, among
31 other possible outcomes.
33 The dictionaries must include the following fields:
37 title: Video title, unescaped.
38 ext: Video filename extension.
40 The following fields are optional:
42 format: The video format, defaults to ext (used for --get-format)
43 thumbnail: Full URL to a video thumbnail image.
44 description: One-line video description.
45 uploader: Full name of the video uploader.
46 upload_date: Video upload date (YYYYMMDD).
47 uploader_id: Nickname or id of the video uploader.
48 location: Physical location of the video.
49 player_url: SWF Player URL (used for rtmpdump).
50 subtitles: The .srt file contents.
51 urlhandle: [internal] The urlHandle to be used to download the file,
52 like returned by urllib.request.urlopen
54 The fields should all be Unicode strings.
56 Subclasses of this one should re-define the _real_initialize() and
57 _real_extract() methods and define a _VALID_URL regexp.
58 Probably, they should also be added to the list of extractors.
60 _real_extract() must return a *list* of information dictionaries as
63 Finally, the _WORKING attribute should be set to False for broken IEs
64 in order to warn the users and skip the tests.
71 def __init__(self, downloader=None):
72 """Constructor. Receives an optional downloader."""
74 self.set_downloader(downloader)
76 def suitable(self, url):
77 """Receives a URL and returns True if suitable for this IE."""
78 return re.match(self._VALID_URL, url) is not None
81 """Getter method for _WORKING."""
85 """Initializes an instance (authentication, etc)."""
87 self._real_initialize()
90 def extract(self, url):
91 """Extracts URL information and returns it in list of dicts."""
93 return self._real_extract(url)
95 def set_downloader(self, downloader):
96 """Sets the downloader for this IE."""
97 self._downloader = downloader
99 def _real_initialize(self):
100 """Real initialization process. Redefine in subclasses."""
103 def _real_extract(self, url):
104 """Real extraction process. Redefine in subclasses."""
109 return type(self).__name__[:-2]
111 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
112 """ Returns the response handle """
114 note = u'Downloading video webpage'
115 self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
117 return compat_urllib_request.urlopen(url_or_request)
118 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
120 errnote = u'Unable to download webpage'
121 raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
123 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
124 """ Returns the data of the page as a string """
125 urlh = self._request_webpage(url_or_request, video_id, note, errnote)
126 webpage_bytes = urlh.read()
127 return webpage_bytes.decode('utf-8', 'replace')
130 class YoutubeIE(InfoExtractor):
131 """Information extractor for youtube.com."""
135 (?:https?://)? # http(s):// (optional)
136 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
137 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
138 (?:.*?\#/)? # handle anchor (#/) redirect urls
139 (?!view_play_list|my_playlists|artist|playlist) # ignore playlist URLs
140 (?: # the various things that can precede the ID:
141 (?:(?:v|embed|e)/) # v/ or embed/ or e/
142 |(?: # or the v= param in all its forms
143 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
144 (?:\?|\#!?) # the params delimiter ? or # or #!
145 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
148 )? # optional -> youtube.com/xxxx is OK
149 )? # all until now is optional -> you can pass the naked ID
150 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
151 (?(1).+)? # if we found the ID, everything can follow
153 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
154 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
155 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
156 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
157 _NETRC_MACHINE = 'youtube'
158 # Listed in order of quality
159 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
160 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
161 _video_extensions = {
167 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
173 _video_dimensions = {
191 def suitable(self, url):
192 """Receives a URL and returns True if suitable for this IE."""
193 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
195 def report_lang(self):
196 """Report attempt to set language."""
197 self._downloader.to_screen(u'[youtube] Setting language')
199 def report_login(self):
200 """Report attempt to log in."""
201 self._downloader.to_screen(u'[youtube] Logging in')
203 def report_age_confirmation(self):
204 """Report attempt to confirm age."""
205 self._downloader.to_screen(u'[youtube] Confirming age')
207 def report_video_webpage_download(self, video_id):
208 """Report attempt to download video webpage."""
209 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
211 def report_video_info_webpage_download(self, video_id):
212 """Report attempt to download video info webpage."""
213 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
215 def report_video_subtitles_download(self, video_id):
216 """Report attempt to download video info webpage."""
217 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
219 def report_information_extraction(self, video_id):
220 """Report attempt to extract video information."""
221 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
223 def report_unavailable_format(self, video_id, format):
224 """Report extracted video URL."""
225 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
227 def report_rtmp_download(self):
228 """Indicate the download will use the RTMP protocol."""
229 self._downloader.to_screen(u'[youtube] RTMP download detected')
231 def _closed_captions_xml_to_srt(self, xml_string):
233 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
234 # TODO parse xml instead of regex
235 for n, (start, dur_tag, dur, caption) in enumerate(texts):
236 if not dur: dur = '4'
238 end = start + float(dur)
239 start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
240 end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
241 caption = unescapeHTML(caption)
242 caption = unescapeHTML(caption) # double cycle, intentional
243 srt += str(n+1) + '\n'
244 srt += start + ' --> ' + end + '\n'
245 srt += caption + '\n\n'
248 def _extract_subtitles(self, video_id):
249 self.report_video_subtitles_download(video_id)
250 request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
252 srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
253 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
254 return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
255 srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
256 srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
257 if not srt_lang_list:
258 return (u'WARNING: video has no closed captions', None)
259 if self._downloader.params.get('subtitleslang', False):
260 srt_lang = self._downloader.params.get('subtitleslang')
261 elif 'en' in srt_lang_list:
264 srt_lang = list(srt_lang_list.keys())[0]
265 if not srt_lang in srt_lang_list:
266 return (u'WARNING: no closed captions found in the specified language', None)
267 request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
269 srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
270 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
271 return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
273 return (u'WARNING: unable to download video subtitles', None)
274 return (None, self._closed_captions_xml_to_srt(srt_xml))
276 def _print_formats(self, formats):
277 print('Available formats:')
279 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
281 def _real_initialize(self):
282 if self._downloader is None:
287 downloader_params = self._downloader.params
289 # Attempt to use provided username and password or .netrc data
290 if downloader_params.get('username', None) is not None:
291 username = downloader_params['username']
292 password = downloader_params['password']
293 elif downloader_params.get('usenetrc', False):
295 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
300 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
301 except (IOError, netrc.NetrcParseError) as err:
302 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
306 request = compat_urllib_request.Request(self._LANG_URL)
309 compat_urllib_request.urlopen(request).read()
310 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
311 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
314 # No authentication to be performed
320 'current_form': 'loginForm',
322 'action_login': 'Log In',
323 'username': username,
324 'password': password,
326 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
329 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
330 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
331 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
333 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
334 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
340 'action_confirm': 'Confirm',
342 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
344 self.report_age_confirmation()
345 age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
346 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
347 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
350 def _extract_id(self, url):
351 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
353 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
355 video_id = mobj.group(2)
358 def _real_extract(self, url):
359 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
360 mobj = re.search(self._NEXT_URL_RE, url)
362 url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
363 video_id = self._extract_id(url)
366 self.report_video_webpage_download(video_id)
367 url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
368 request = compat_urllib_request.Request(url)
370 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
371 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
372 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
375 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
377 # Attempt to extract SWF player URL
378 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
380 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
385 self.report_video_info_webpage_download(video_id)
386 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
387 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
388 % (video_id, el_type))
389 request = compat_urllib_request.Request(video_info_url)
391 video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
392 video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
393 video_info = compat_parse_qs(video_info_webpage)
394 if 'token' in video_info:
396 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
397 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
399 if 'token' not in video_info:
400 if 'reason' in video_info:
401 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
403 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
406 # Check for "rental" videos
407 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
408 self._downloader.trouble(u'ERROR: "rental" videos not supported')
411 # Start extracting information
412 self.report_information_extraction(video_id)
415 if 'author' not in video_info:
416 self._downloader.trouble(u'ERROR: unable to extract uploader name')
418 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
421 video_uploader_id = None
422 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
424 video_uploader_id = mobj.group(1)
426 self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
429 if 'title' not in video_info:
430 self._downloader.trouble(u'ERROR: unable to extract video title')
432 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
435 if 'thumbnail_url' not in video_info:
436 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
438 else: # don't panic if we can't find it
439 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
443 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
445 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
446 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
447 for expression in format_expressions:
449 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
454 video_description = get_element_by_id("eow-description", video_webpage)
455 if video_description:
456 video_description = clean_html(video_description)
458 video_description = ''
461 video_subtitles = None
462 if self._downloader.params.get('writesubtitles', False):
463 (srt_error, video_subtitles) = self._extract_subtitles(video_id)
465 self._downloader.trouble(srt_error)
467 if 'length_seconds' not in video_info:
468 self._downloader.trouble(u'WARNING: unable to extract video duration')
471 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
474 video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
476 # Decide which formats to download
477 req_format = self._downloader.params.get('format', None)
479 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
480 self.report_rtmp_download()
481 video_url_list = [(None, video_info['conn'][0])]
482 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
483 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
484 url_data = [compat_parse_qs(uds) for uds in url_data_strs]
485 url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
486 url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
488 format_limit = self._downloader.params.get('format_limit', None)
489 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
490 if format_limit is not None and format_limit in available_formats:
491 format_list = available_formats[available_formats.index(format_limit):]
493 format_list = available_formats
494 existing_formats = [x for x in format_list if x in url_map]
495 if len(existing_formats) == 0:
496 self._downloader.trouble(u'ERROR: no known formats available for video')
498 if self._downloader.params.get('listformats', None):
499 self._print_formats(existing_formats)
501 if req_format is None or req_format == 'best':
502 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
503 elif req_format == 'worst':
504 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
505 elif req_format in ('-1', 'all'):
506 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
508 # Specific formats. We pick the first in a slash-delimeted sequence.
509 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
510 req_formats = req_format.split('/')
511 video_url_list = None
512 for rf in req_formats:
514 video_url_list = [(rf, url_map[rf])]
516 if video_url_list is None:
517 self._downloader.trouble(u'ERROR: requested format not available')
520 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
524 for format_param, video_real_url in video_url_list:
526 video_extension = self._video_extensions.get(format_param, 'flv')
528 video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
529 self._video_dimensions.get(format_param, '???'))
533 'url': video_real_url,
534 'uploader': video_uploader,
535 'uploader_id': video_uploader_id,
536 'upload_date': upload_date,
537 'title': video_title,
538 'ext': video_extension,
539 'format': video_format,
540 'thumbnail': video_thumbnail,
541 'description': video_description,
542 'player_url': player_url,
543 'subtitles': video_subtitles,
544 'duration': video_duration
549 class MetacafeIE(InfoExtractor):
550 """Information Extractor for metacafe.com."""
552 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
553 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
554 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
555 IE_NAME = u'metacafe'
557 def __init__(self, downloader=None):
558 InfoExtractor.__init__(self, downloader)
560 def report_disclaimer(self):
561 """Report disclaimer retrieval."""
562 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
564 def report_age_confirmation(self):
565 """Report attempt to confirm age."""
566 self._downloader.to_screen(u'[metacafe] Confirming age')
568 def report_download_webpage(self, video_id):
569 """Report webpage download."""
570 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
572 def report_extraction(self, video_id):
573 """Report information extraction."""
574 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
576 def _real_initialize(self):
577 # Retrieve disclaimer
578 request = compat_urllib_request.Request(self._DISCLAIMER)
580 self.report_disclaimer()
581 disclaimer = compat_urllib_request.urlopen(request).read()
582 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
583 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
589 'submit': "Continue - I'm over 18",
591 request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
593 self.report_age_confirmation()
594 disclaimer = compat_urllib_request.urlopen(request).read()
595 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
596 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
599 def _real_extract(self, url):
600 # Extract id and simplified title from URL
601 mobj = re.match(self._VALID_URL, url)
603 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
606 video_id = mobj.group(1)
608 # Check if video comes from YouTube
609 mobj2 = re.match(r'^yt-(.*)$', video_id)
610 if mobj2 is not None:
611 self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
614 # Retrieve video webpage to extract further information
615 request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
617 self.report_download_webpage(video_id)
618 webpage = compat_urllib_request.urlopen(request).read()
619 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
620 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
623 # Extract URL, uploader and title from webpage
624 self.report_extraction(video_id)
625 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
627 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
628 video_extension = mediaURL[-3:]
630 # Extract gdaKey if available
631 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
635 gdaKey = mobj.group(1)
636 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
638 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
640 self._downloader.trouble(u'ERROR: unable to extract media URL')
642 vardict = compat_parse_qs(mobj.group(1))
643 if 'mediaData' not in vardict:
644 self._downloader.trouble(u'ERROR: unable to extract media URL')
646 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
648 self._downloader.trouble(u'ERROR: unable to extract media URL')
650 mediaURL = mobj.group(1).replace('\\/', '/')
651 video_extension = mediaURL[-3:]
652 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
654 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
656 self._downloader.trouble(u'ERROR: unable to extract title')
658 video_title = mobj.group(1).decode('utf-8')
660 mobj = re.search(r'submitter=(.*?);', webpage)
662 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
664 video_uploader = mobj.group(1)
667 'id': video_id.decode('utf-8'),
668 'url': video_url.decode('utf-8'),
669 'uploader': video_uploader.decode('utf-8'),
671 'title': video_title,
672 'ext': video_extension.decode('utf-8'),
676 class DailymotionIE(InfoExtractor):
677 """Information Extractor for Dailymotion"""
679 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
680 IE_NAME = u'dailymotion'
682 def __init__(self, downloader=None):
683 InfoExtractor.__init__(self, downloader)
685 def report_extraction(self, video_id):
686 """Report information extraction."""
687 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
689 def _real_extract(self, url):
690 # Extract id and simplified title from URL
691 mobj = re.match(self._VALID_URL, url)
693 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
696 video_id = mobj.group(1).split('_')[0].split('?')[0]
698 video_extension = 'mp4'
700 # Retrieve video webpage to extract further information
701 request = compat_urllib_request.Request(url)
702 request.add_header('Cookie', 'family_filter=off')
703 webpage = self._download_webpage(request, video_id)
705 # Extract URL, uploader and title from webpage
706 self.report_extraction(video_id)
707 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
709 self._downloader.trouble(u'ERROR: unable to extract media URL')
711 flashvars = compat_urllib_parse.unquote(mobj.group(1))
713 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
716 self._downloader.to_screen(u'[dailymotion] Using %s' % key)
719 self._downloader.trouble(u'ERROR: unable to extract video URL')
722 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
724 self._downloader.trouble(u'ERROR: unable to extract video URL')
727 video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
729 # TODO: support choosing qualities
731 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
733 self._downloader.trouble(u'ERROR: unable to extract title')
735 video_title = unescapeHTML(mobj.group('title'))
737 video_uploader = None
738 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
740 # lookin for official user
741 mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
742 if mobj_official is None:
743 self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
745 video_uploader = mobj_official.group(1)
747 video_uploader = mobj.group(1)
749 video_upload_date = None
750 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
752 video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
757 'uploader': video_uploader,
758 'upload_date': video_upload_date,
759 'title': video_title,
760 'ext': video_extension,
764 class PhotobucketIE(InfoExtractor):
765 """Information extractor for photobucket.com."""
767 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
768 IE_NAME = u'photobucket'
770 def __init__(self, downloader=None):
771 InfoExtractor.__init__(self, downloader)
773 def report_download_webpage(self, video_id):
774 """Report webpage download."""
775 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
777 def report_extraction(self, video_id):
778 """Report information extraction."""
779 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
781 def _real_extract(self, url):
782 # Extract id from URL
783 mobj = re.match(self._VALID_URL, url)
785 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
788 video_id = mobj.group(1)
790 video_extension = 'flv'
792 # Retrieve video webpage to extract further information
793 request = compat_urllib_request.Request(url)
795 self.report_download_webpage(video_id)
796 webpage = compat_urllib_request.urlopen(request).read()
797 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
798 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
801 # Extract URL, uploader, and title from webpage
802 self.report_extraction(video_id)
803 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
805 self._downloader.trouble(u'ERROR: unable to extract media URL')
807 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
811 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
813 self._downloader.trouble(u'ERROR: unable to extract title')
815 video_title = mobj.group(1).decode('utf-8')
817 video_uploader = mobj.group(2).decode('utf-8')
820 'id': video_id.decode('utf-8'),
821 'url': video_url.decode('utf-8'),
822 'uploader': video_uploader,
824 'title': video_title,
825 'ext': video_extension.decode('utf-8'),
829 class YahooIE(InfoExtractor):
830 """Information extractor for video.yahoo.com."""
833 # _VALID_URL matches all Yahoo! Video URLs
834 # _VPAGE_URL matches only the extractable '/watch/' URLs
835 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
836 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
837 IE_NAME = u'video.yahoo'
839 def __init__(self, downloader=None):
840 InfoExtractor.__init__(self, downloader)
842 def report_download_webpage(self, video_id):
843 """Report webpage download."""
844 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
846 def report_extraction(self, video_id):
847 """Report information extraction."""
848 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
850 def _real_extract(self, url, new_video=True):
851 # Extract ID from URL
852 mobj = re.match(self._VALID_URL, url)
854 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
857 video_id = mobj.group(2)
858 video_extension = 'flv'
860 # Rewrite valid but non-extractable URLs as
861 # extractable English language /watch/ URLs
862 if re.match(self._VPAGE_URL, url) is None:
863 request = compat_urllib_request.Request(url)
865 webpage = compat_urllib_request.urlopen(request).read()
866 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
867 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
870 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
872 self._downloader.trouble(u'ERROR: Unable to extract id field')
874 yahoo_id = mobj.group(1)
876 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
878 self._downloader.trouble(u'ERROR: Unable to extract vid field')
880 yahoo_vid = mobj.group(1)
882 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
883 return self._real_extract(url, new_video=False)
885 # Retrieve video webpage to extract further information
886 request = compat_urllib_request.Request(url)
888 self.report_download_webpage(video_id)
889 webpage = compat_urllib_request.urlopen(request).read()
890 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
891 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
894 # Extract uploader and title from webpage
895 self.report_extraction(video_id)
896 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
898 self._downloader.trouble(u'ERROR: unable to extract video title')
900 video_title = mobj.group(1).decode('utf-8')
902 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
904 self._downloader.trouble(u'ERROR: unable to extract video uploader')
906 video_uploader = mobj.group(1).decode('utf-8')
908 # Extract video thumbnail
909 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
911 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
913 video_thumbnail = mobj.group(1).decode('utf-8')
915 # Extract video description
916 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
918 self._downloader.trouble(u'ERROR: unable to extract video description')
920 video_description = mobj.group(1).decode('utf-8')
921 if not video_description:
922 video_description = 'No description available.'
924 # Extract video height and width
925 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
927 self._downloader.trouble(u'ERROR: unable to extract video height')
929 yv_video_height = mobj.group(1)
931 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
933 self._downloader.trouble(u'ERROR: unable to extract video width')
935 yv_video_width = mobj.group(1)
937 # Retrieve video playlist to extract media URL
938 # I'm not completely sure what all these options are, but we
939 # seem to need most of them, otherwise the server sends a 401.
940 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
941 yv_bitrate = '700' # according to Wikipedia this is hard-coded
942 request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
943 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
944 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
946 self.report_download_webpage(video_id)
947 webpage = compat_urllib_request.urlopen(request).read()
948 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
949 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
952 # Extract media URL from playlist XML
953 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
955 self._downloader.trouble(u'ERROR: Unable to extract media URL')
957 video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
958 video_url = unescapeHTML(video_url)
961 'id': video_id.decode('utf-8'),
963 'uploader': video_uploader,
965 'title': video_title,
966 'ext': video_extension.decode('utf-8'),
967 'thumbnail': video_thumbnail.decode('utf-8'),
968 'description': video_description,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs.
    # FIX: the dot after (?:www|player) was unescaped, so it matched ANY
    # character (e.g. "playerXvimeo.com"); it is now escaped and made part
    # of an optional "(www.|player.)" group.
    _VALID_URL = r'(?:https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page (string-split rather
        # than HTML parsing; any failure means the page layout changed).
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except:
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # Extract upload date (YYYYMMDD from the ISO date in the meta tag)
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = {'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    # Fall back on the first advertised quality for this codec
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    % (video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
        }]
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download url and return the raw page, or None after reporting an error."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url, apply regex and collect the groups listed in matchTuples.

        matchTuples is a list of (group_index, key, error_message); on any
        missing group the given error is reported and None is returned.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Follow the live-stream JS indirection and compute the rtmp URL.

        NOTE(review): the computed video_url is never returned or stored —
        looks like live extraction was left unfinished upstream; preserved
        as-is to avoid changing behavior.
        """
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            # FIX: raw strings — '\.' and '\'' here are regex syntax, not
            # Python string escapes.
            (r'(s_artestras_scst_geoFRDE_' + video_lang + r'.*?)\'.*?' +
                r'(http://.*?\.swf).*?' +
                r'(rtmp://.*?)\''),
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve the arte+7 player indirections and return an info dict."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                r'<name>(.*?)</name>.*?' +
                r'<dateVideo>(.*?)</dateVideo>.*?' +
                r'<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title').decode('utf-8'),
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Strip body-related headers: a HEAD request has no body.
                    newheaders = dict((k, v) for k, v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k, v) for k, v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                        headers=newheaders,
                                        origin_req_host=req.get_origin_req_host(),
                                        unverifiable=True))

        # Build our own opener so the custom HEAD handlers are used.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True

    def _real_extract(self, url):
        if self._test_redirect(url):
            return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
                return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            # FIX: this path used to report "unable to extract title"
            # (copy-paste from the title branch above).
            self._downloader.trouble(u'ERROR: unable to extract uploader')
            return
        video_uploader = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
        }]
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""

    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the leading 'ytsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            # FIX(idiom): list comprehension instead of list(generator)
            new_ids = [video['id'] for video in api_response['items']]
            video_ids += new_ids

            # The API reports the true total; never ask for more than exists.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        # FIX(idiom): do not shadow the builtin `id`
        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""

    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the leading 'gvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        # FIX(idiom): do not shadow the builtin `id`
                        for vid in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % vid])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for vid in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % vid])
                return

            pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the leading 'yvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers; `already_seen` gives O(1) dedup.
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        # FIX(idiom): do not shadow the builtin `id`
                        for vid in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % vid])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for vid in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % vid])
                return

            pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Single video case: group(3) carries the video id
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])
            return

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        else:
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(playlist_id, pagenum)
            url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        total = len(video_ids)

        # Apply --playlist-start / --playlist-end (1-based, -1 = no end)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        if len(video_ids) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))

        # FIX(idiom): do not shadow the builtin `id`
        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Download channel pages
        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(channel_id, pagenum)
            url = self._TEMPLATE_URL % (channel_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []
            for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        # FIX(idiom): do not shadow the builtin `id`
        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        # Extract the username from the URL.
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.
        video_ids = []
        page_idx = 0

        while True:
            start_index = page_idx * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers, keeping first-seen order per page.
            page_ids = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = match.group(1)
                if candidate not in page_ids:
                    page_ids.append(candidate)
            video_ids.extend(page_ids)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break

            page_idx += 1

        all_ids_count = len(video_ids)

        # Honor --playlist-start / --playlist-end (1-based, -1 = open end).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        video_ids = (video_ids[playliststart:] if playlistend == -1
                     else video_ids[playliststart:playlistend])

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            mobj = re.search(r'data-users-id="([^"]+)"', page)
            page_base = page_base % mobj.group(1)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)

            request = compat_urllib_request.Request(page_base + "&page=" + str(pagenum))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # FIX(consistency): use compat_str(err) like every other
                # error path in this file, not bare str(err).
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.
            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # FIX: raw string for the regex pattern ('\s' is regex
                # syntax, not a Python string escape).
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
1981 class FacebookIE(InfoExtractor):
1982 """Information Extractor for Facebook"""
# Matches video/photo permalinks; the numeric video id is captured as group 'ID'.
1984 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1985 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1986 _NETRC_MACHINE = 'facebook'
1987 IE_NAME = u'facebook'
# Report a login attempt to the screen.
1989 def report_login(self):
1990 """Report attempt to log in."""
1991 self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)
# Optional login step: resolves credentials from --username/--password or
# ~/.netrc, POSTs to the mobile login endpoint, and warns (does not abort)
# if the login form reappears in the response, which indicates failure.
1993 def _real_initialize(self):
1994 if self._downloader is None:
1999 downloader_params = self._downloader.params
2001 # Attempt to use provided username and password or .netrc data
2002 if downloader_params.get('username', None) is not None:
2003 useremail = downloader_params['username']
2004 password = downloader_params['password']
2005 elif downloader_params.get('usenetrc', False):
2007 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2008 if info is not None:
2012 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2013 except (IOError, netrc.NetrcParseError) as err:
# netrc problems are non-fatal: warn and continue unauthenticated.
2014 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
2017 if useremail is None:
2026 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2029 login_results = compat_urllib_request.urlopen(request).read()
# A <form name="login"> in the response means credentials were rejected.
2030 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2031 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2033 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2034 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
# Extract the HD video URL, title, duration and thumbnail from the page's
# embedded SWF parameter JSON.
2037 def _real_extract(self, url):
2038 mobj = re.match(self._VALID_URL, url)
2040 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2042 video_id = mobj.group('ID')
2044 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
2045 webpage = self._download_webpage(url, video_id)
# The flashvars JSON sits between these two literal script fragments.
2047 BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
2048 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
2049 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
2051 raise ExtractorError(u'Cannot parse data')
# The captured text is a JSON array of [key, value] pairs → dict.
2052 data = dict(json.loads(m.group(1)))
# NOTE(review): only 'hd_src' is read; pages without an HD stream would raise
# KeyError here — confirm whether an 'sd_src' fallback is needed.
2053 video_url = compat_urllib_parse.unquote(data['hd_src'])
2054 video_duration = int(data['video_duration'])
2056 m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
2058 raise ExtractorError(u'Cannot find title in webpage')
2059 video_title = unescapeHTML(m.group(1))
2063 'title': video_title,
2066 'duration': video_duration,
2067 'thumbnail': data['thumbnail_src'],
2072 class BlipTVIE(InfoExtractor):
2073 """Information extractor for blip.tv"""
2075 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# Used to pull the filename extension off the media URL.
2076 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2077 IE_NAME = u'blip.tv'
# Report that information extraction has started.
2079 def report_extraction(self, file_id):
2080 """Report information extraction."""
2081 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
# Report that the URL turned out to be a direct media download.
2083 def report_direct_download(self, title):
2084 """Report information extraction."""
2085 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
# Extract via blip.tv's JSON API (skin=json). If the server answers with a
# video/* Content-Type the URL is already the media file and is used directly.
2087 def _real_extract(self, url):
2088 mobj = re.match(self._VALID_URL, url)
2090 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# cchar ('?' or '&') joins the JSON-skin query onto the original URL.
2097 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2098 request = compat_urllib_request.Request(json_url)
# blip.tv serves different data depending on User-Agent; impersonate iTunes.
2099 request.add_header('User-Agent', 'iTunes/10.6.1')
2100 self.report_extraction(mobj.group(1))
2103 urlh = compat_urllib_request.urlopen(request)
2104 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2105 basename = url.split('/')[-1]
2106 title,ext = os.path.splitext(basename)
# NOTE(review): title.decode('UTF-8') is Python-2 style; under Python 3
# this would fail on a str — confirm target runtime.
2107 title = title.decode('UTF-8')
2108 ext = ext.replace('.', '')
2109 self.report_direct_download(title)
2114 'upload_date': None,
2119 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2120 raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2121 if info is None: # Regular URL
2123 json_code_bytes = urlh.read()
2124 json_code = json_code_bytes.decode('utf-8')
2125 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2126 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
2130 json_data = json.loads(json_code)
# The API wraps the payload in a 'Post' object for single videos.
2131 if 'Post' in json_data:
2132 data = json_data['Post']
# Convert the API's '%m-%d-%y %H:%M%p' datestamp to YYYYMMDD per the
# upload_date contract.
2136 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2137 video_url = data['media']['url']
2138 umobj = re.match(self._URL_EXT, video_url)
2140 raise ValueError('Can not determine filename extension')
2141 ext = umobj.group(1)
2144 'id': data['item_id'],
2146 'uploader': data['display_name'],
2147 'upload_date': upload_date,
2148 'title': data['title'],
2150 'format': data['media']['mimeType'],
2151 'thumbnail': data['thumbnailUrl'],
2152 'description': data['description'],
2153 'player_url': data['embedUrl'],
# Same UA must be used for the media download as for the API call.
2154 'user_agent': 'iTunes/10.6.1',
2156 except (ValueError,KeyError) as err:
2157 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2163 class MyVideoIE(InfoExtractor):
2164 """Information Extractor for myvideo.de."""
2166 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2167 IE_NAME = u'myvideo'
# Plain pass-through constructor; kept for parity with sibling extractors.
2169 def __init__(self, downloader=None):
2170 InfoExtractor.__init__(self, downloader)
# Report that information extraction has started.
2172 def report_extraction(self, video_id):
2173 """Report information extraction."""
2174 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
# Extract the FLV URL by scraping the thumbnail link (image_src) and
# appending '/<id>.flv' to its media-path prefix.
2176 def _real_extract(self,url):
2177 mobj = re.match(self._VALID_URL, url)
# NOTE(review): `self._download` is almost certainly a typo for
# `self._downloader` — as written this raises AttributeError instead of
# reporting the invalid-URL error. Every sibling extractor uses
# `self._downloader.trouble(...)`.
2179 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2182 video_id = mobj.group(1)
2185 webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2186 webpage = self._download_webpage(webpage_url, video_id)
2188 self.report_extraction(video_id)
# The image_src thumbnail URL carries the movie's media-path prefix.
2189 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2192 self._downloader.trouble(u'ERROR: unable to extract media URL')
2194 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2196 mobj = re.search('<title>([^<]+)</title>', webpage)
2198 self._downloader.trouble(u'ERROR: unable to extract title')
2201 video_title = mobj.group(1)
2207 'upload_date': None,
2208 'title': video_title,
2212 class ComedyCentralIE(InfoExtractor):
2213 """Information extractor for The Daily Show and Colbert Report """
2215 # urls can be abbreviations like :thedailyshow or :colbert
2216 # urls for episodes like:
2217 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2218 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2219 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
# Verbose regex: matches ':shortname' abbreviations, full-episode pages,
# Colbert clip/collection pages and Daily Show clip pages.
2220 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2221 |(https?://)?(www\.)?
2222 (?P<showname>thedailyshow|colbertnation)\.com/
2223 (full-episodes/(?P<episode>.*)|
2225 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2226 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
# Known bitrates, lowest-priority first (last entry is picked as "best").
2229 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2231 _video_extensions = {
2239 _video_dimensions = {
# Overridden because _VALID_URL needs re.VERBOSE (base class matches plain).
2248 def suitable(self, url):
2249 """Receives a URL and returns True if suitable for this IE."""
2250 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
2252 def report_extraction(self, episode_id):
2253 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2255 def report_config_download(self, episode_id, media_id):
2256 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))
2258 def report_index_download(self, episode_id):
2259 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
# Print each format id with its extension and dimensions for --list-formats.
2261 def _print_formats(self, formats):
2262 print('Available formats:')
2264 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
# Resolve the URL (expanding shortnames, following redirects), download the
# MRSS show index, then for each <item> fetch its mediagen config and pick a
# rendition; the RTMP URL is rewritten to a direct HTTP mirror.
2267 def _real_extract(self, url):
2268 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2270 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# ':tds' / ':colbert' style abbreviations expand to the full-episodes page.
2273 if mobj.group('shortname'):
2274 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2275 url = u'http://www.thedailyshow.com/full-episodes/'
2277 url = u'http://www.colbertnation.com/full-episodes/'
2278 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2279 assert mobj is not None
2281 if mobj.group('clip'):
2282 if mobj.group('showname') == 'thedailyshow':
2283 epTitle = mobj.group('tdstitle')
2285 epTitle = mobj.group('cntitle')
# No episode in the URL means "download the newest" (after redirect).
2288 dlNewest = not mobj.group('episode')
2290 epTitle = mobj.group('showname')
2292 epTitle = mobj.group('episode')
2294 req = compat_urllib_request.Request(url)
2295 self.report_extraction(epTitle)
2297 htmlHandle = compat_urllib_request.urlopen(req)
2298 html = htmlHandle.read()
2299 webpage = html.decode('utf-8')
2300 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2301 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
# Re-parse the post-redirect URL; it must now name a specific episode.
2304 url = htmlHandle.geturl()
2305 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2307 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2309 if mobj.group('episode') == '':
2310 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2312 epTitle = mobj.group('episode')
2314 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2316 if len(mMovieParams) == 0:
2317 # The Colbert Report embeds the information in a without
2318 # a URL prefix; so extract the alternate reference
2319 # and then add the URL prefix manually.
2321 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2322 if len(altMovieParams) == 0:
2323 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2326 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
# The mgid URI identifies the whole episode; the MRSS index lists its parts.
2328 uri = mMovieParams[0][1]
2329 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2330 self.report_index_download(epTitle)
2332 indexXml = compat_urllib_request.urlopen(indexUrl).read()
2333 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2334 self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
2339 idoc = xml.etree.ElementTree.fromstring(indexXml)
2340 itemEls = idoc.findall('.//item')
# One <item> per episode part (act); each gets its own info dict.
2341 for partNum,itemEl in enumerate(itemEls):
2342 mediaId = itemEl.findall('./guid')[0].text
2343 shortMediaId = mediaId.split(':')[-1]
2344 showId = mediaId.split(':')[-2].replace('.com', '')
2345 officialTitle = itemEl.findall('./title')[0].text
2346 officialDate = itemEl.findall('./pubDate')[0].text
2348 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2349 compat_urllib_parse.urlencode({'uri': mediaId}))
2350 configReq = compat_urllib_request.Request(configUrl)
2351 self.report_config_download(epTitle, shortMediaId)
2353 configXml = compat_urllib_request.urlopen(configReq).read()
2354 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2355 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
2358 cdoc = xml.etree.ElementTree.fromstring(configXml)
# Collect (bitrate, rtmp-url) pairs from each <rendition>.
2360 for rendition in cdoc.findall('.//rendition'):
2361 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2365 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2368 if self._downloader.params.get('listformats', None):
2369 self._print_formats([i[0] for i in turls])
2372 # For now, just pick the highest bitrate
2373 format,rtmp_video_url = turls[-1]
2375 # Get the format arg from the arg stream
2376 req_format = self._downloader.params.get('format', None)
2378 # Select format if we can find one
2381 format, rtmp_video_url = f, v
# Rewrite the rtmp(e) URL onto the known HTTP mirror (mtvnmobile) so the
# file can be fetched without rtmpdump.
2384 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2386 raise ExtractorError(u'Cannot transform RTMP url')
2387 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2388 video_url = base + m.group('finalid')
# Part number is 1-based in the user-visible title.
2390 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
# NOTE(review): officialDate is an RSS pubDate string, not YYYYMMDD as the
# upload_date contract in the class docstring requires — confirm.
2395 'upload_date': officialDate,
2400 'description': officialTitle,
2402 results.append(info)
2407 class EscapistIE(InfoExtractor):
2408 """Information extractor for The Escapist """
2410 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2411 IE_NAME = u'escapist'
2413 def report_extraction(self, showName):
2414 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2416 def report_config_download(self, showName):
2417 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
# Scrape OpenGraph meta tags for description/thumbnail/player, then follow
# the player's 'config=' query parameter to a JS config whose playlist holds
# the actual media URL.
2419 def _real_extract(self, url):
2420 mobj = re.match(self._VALID_URL, url)
2422 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2424 showName = mobj.group('showname')
2425 videoId = mobj.group('episode')
2427 self.report_extraction(showName)
2429 webPage = compat_urllib_request.urlopen(url)
2430 webPageBytes = webPage.read()
# Decode using the charset from the Content-Type header, UTF-8 otherwise.
2431 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2432 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2433 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2434 self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
# NOTE(review): the four .group(1) calls below assume every meta tag matched;
# a missing tag raises AttributeError rather than a clean extractor error.
2437 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2438 description = unescapeHTML(descMatch.group(1))
2439 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2440 imgUrl = unescapeHTML(imgMatch.group(1))
2441 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2442 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2443 configUrlMatch = re.search('config=(.*)$', playerUrl)
2444 configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2446 self.report_config_download(showName)
2448 configJSON = compat_urllib_request.urlopen(configUrl)
2449 m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
2450 configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
2451 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2452 self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
2455 # Technically, it's JavaScript, not JSON
# Single→double quote swap makes the JS object parseable as JSON; breaks if
# any value legitimately contains a single quote.
2456 configJSON = configJSON.replace("'", '"')
2459 config = json.loads(configJSON)
2460 except (ValueError,) as err:
2461 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
2464 playlist = config['playlist']
# playlist[1] is the actual video entry (playlist[0] is presumably an
# intro/ad — not visible here; confirm against a live config).
2465 videoUrl = playlist[1]['url']
2470 'uploader': showName,
2471 'upload_date': None,
2474 'thumbnail': imgUrl,
2475 'description': description,
2476 'player_url': playerUrl,
2481 class CollegeHumorIE(InfoExtractor):
2482 """Information extractor for collegehumor.com"""
2485 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2486 IE_NAME = u'collegehumor'
2488 def report_manifest(self, video_id):
2489 """Report information extraction."""
2490 self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))
2492 def report_extraction(self, video_id):
2493 """Report information extraction."""
2494 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
# Two-step extraction: the moogaloop metadata XML yields title/description/
# thumbnail and an Adobe HDS (f4m) manifest URL; the manifest then yields the
# media node from which the final segment URL is assembled.
2496 def _real_extract(self, url):
2497 mobj = re.match(self._VALID_URL, url)
2499 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2501 video_id = mobj.group('videoid')
2506 'upload_date': None,
2509 self.report_extraction(video_id)
2510 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2512 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2513 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2514 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2517 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2519 videoNode = mdoc.findall('./video')[0]
2520 info['description'] = videoNode.findall('./description')[0].text
2521 info['title'] = videoNode.findall('./caption')[0].text
2522 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2523 manifest_url = videoNode.findall('./file')[0].text
# IndexError from the findall(...)[0] chain above lands here.
2525 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
# hdcore parameter is required by the HDS server to serve the manifest.
2528 manifest_url += '?hdcore=2.10.3'
2529 self.report_manifest(video_id)
2531 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2532 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2533 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2536 adoc = xml.etree.ElementTree.fromstring(manifestXml)
# f4m elements live in the Adobe f4m XML namespace.
2538 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2539 node_id = media_node.attrib['url']
2540 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2541 except IndexError as err:
2542 self._downloader.trouble(u'\nERROR: Invalid manifest file')
# Rebuild the direct segment URL on the manifest's host; video_id[:-2]
# strips a 2-char suffix — presumably a quality marker, confirm.
2545 url_pr = compat_urllib_parse_urlparse(manifest_url)
2546 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
2553 class XVideosIE(InfoExtractor):
2554 """Information extractor for xvideos.com"""
2556 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2557 IE_NAME = u'xvideos'
2559 def report_extraction(self, video_id):
2560 """Report information extraction."""
2561 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
# Scrape the page for the URL-encoded flv_url, the <title> and the thumbnail.
2563 def _real_extract(self, url):
2564 mobj = re.match(self._VALID_URL, url)
2566 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2568 video_id = mobj.group(1)
2570 webpage = self._download_webpage(url, video_id)
2572 self.report_extraction(video_id)
2576 mobj = re.search(r'flv_url=(.+?)&', webpage)
2578 self._downloader.trouble(u'ERROR: unable to extract video url')
# flv_url is percent-encoded in the page source.
2580 video_url = compat_urllib_parse.unquote(mobj.group(1))
# Title is the <title> text with the trailing ' - XVID...' suffix dropped.
2584 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2586 self._downloader.trouble(u'ERROR: unable to extract video title')
2588 video_title = mobj.group(1)
2591 # Extract video thumbnail
2592 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2594 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
# group(0): the whole matched URL is the thumbnail, not the captured name.
2596 video_thumbnail = mobj.group(0)
2602 'upload_date': None,
2603 'title': video_title,
2605 'thumbnail': video_thumbnail,
2606 'description': None,
2612 class SoundcloudIE(InfoExtractor):
2613 """Information extractor for soundcloud.com
2614 To access the media, the uid of the song and a stream token
2615 must be extracted from the page source and the script must make
2616 a request to media.soundcloud.com/crossdomain.xml. Then
2617 the media can be grabbed by requesting from an url composed
2618 of the stream token and uid
# Captures uploader slug (group 1) and track slug (group 2).
2621 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2622 IE_NAME = u'soundcloud'
# Plain pass-through constructor; kept for parity with sibling extractors.
2624 def __init__(self, downloader=None):
2625 InfoExtractor.__init__(self, downloader)
# Report the resolve step (mapping the page URL to an API track id).
2627 def report_resolve(self, video_id):
2628 """Report information extraction."""
2629 self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))
2631 def report_extraction(self, video_id):
2632 """Report information extraction."""
2633 self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))
# Resolve the page URL to track JSON via api.soundcloud.com/resolve.json,
# then fetch the streams listing and use the 128kbps MP3 stream.
2635 def _real_extract(self, url):
2636 mobj = re.match(self._VALID_URL, url)
2638 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2641 # extract uploader (which is in the url)
2642 uploader = mobj.group(1)
2643 # extract simple title (uploader + slug of song title)
2644 slug_title = mobj.group(2)
2645 simple_title = uploader + u'-' + slug_title
2647 self.report_resolve('%s/%s' % (uploader, slug_title))
2649 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
# client_id is a hard-coded public API key.
2650 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2651 request = compat_urllib_request.Request(resolv_url)
2653 info_json_bytes = compat_urllib_request.urlopen(request).read()
2654 info_json = info_json_bytes.decode('utf-8')
2655 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2656 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2659 info = json.loads(info_json)
2660 video_id = info['id']
2661 self.report_extraction('%s/%s' % (uploader, slug_title))
2663 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2664 request = compat_urllib_request.Request(streams_url)
2666 stream_json_bytes = compat_urllib_request.urlopen(request).read()
2667 stream_json = stream_json_bytes.decode('utf-8')
2668 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2669 self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
2672 streams = json.loads(stream_json)
2673 mediaURL = streams['http_mp3_128_url']
2678 'uploader': info['user']['username'],
# NOTE(review): created_at is the API's raw timestamp string, not the
# YYYYMMDD format the upload_date contract (class docstring in HEAD)
# specifies — confirm / convert.
2679 'upload_date': info['created_at'],
2680 'title': info['title'],
2682 'description': info['description'],
2686 class InfoQIE(InfoExtractor):
2687 """Information extractor for infoq.com"""
2688 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2690 def report_extraction(self, video_id):
2691 """Report information extraction."""
2692 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
# The page embeds a base64-encoded, URL-quoted media path in jsclassref;
# decoding it yields the RTMPE stream path.
2694 def _real_extract(self, url):
2695 mobj = re.match(self._VALID_URL, url)
2697 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# No separate id in the URL pattern, so the full URL doubles as the id here.
2700 webpage = self._download_webpage(url, video_id=url)
2701 self.report_extraction(url)
2704 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2706 self._downloader.trouble(u'ERROR: unable to extract video url')
# base64-decode then percent-decode to recover the real media path.
2708 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2709 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2712 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2714 self._downloader.trouble(u'ERROR: unable to extract video title')
2716 video_title = mobj.group(1)
2718 # Extract description
2719 video_description = u'No description available.'
2720 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2721 if mobj is not None:
2722 video_description = mobj.group(1)
# Derive a short id and extension from the media path's filename.
2724 video_filename = video_url.split('/')[-1]
2725 video_id, extension = video_filename.split('.')
2731 'upload_date': None,
2732 'title': video_title,
2733 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2735 'description': video_description,
2740 class MixcloudIE(InfoExtractor):
2741 """Information extractor for www.mixcloud.com"""
# Marked broken: the site moved to a new API (see URL below), so tests skip it.
2743 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2744 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2745 IE_NAME = u'mixcloud'
# Plain pass-through constructor; kept for parity with sibling extractors.
2747 def __init__(self, downloader=None):
2748 InfoExtractor.__init__(self, downloader)
2750 def report_download_json(self, file_id):
2751 """Report JSON download."""
2752 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2754 def report_extraction(self, file_id):
2755 """Report information extraction."""
2756 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
# Return the URL list for a given format; picks the highest bitrate when
# bitrate is 'best'/unknown, and tolerates formats with no bitrate levels.
2758 def get_urls(self, jsonData, fmt, bitrate='best'):
2759 """Get urls from 'audio_formats' section in json"""
2762 bitrate_list = jsonData[fmt]
2763 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2764 bitrate = max(bitrate_list) # select highest
2766 url_list = jsonData[fmt][bitrate]
2767 except TypeError: # we have no bitrate info.
2768 url_list = jsonData[fmt]
# Probe each candidate URL and return the first that answers.
2771 def check_urls(self, url_list):
2772 """Returns 1st active url from list"""
2773 for url in url_list:
2775 compat_urllib_request.urlopen(url)
2777 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
# List every format/bitrate pair with its extension for --list-formats.
2782 def _print_formats(self, formats):
2783 print('Available formats:')
2784 for fmt in formats.keys():
2785 for b in formats[fmt]:
2787 ext = formats[fmt][b][0]
2788 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2789 except TypeError: # we have no bitrate info
2790 ext = formats[fmt][0]
2791 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
# Fetch the cloudcast JSON from the old /api/1/ endpoint and pick a working
# URL for the requested (or best) audio format.
2794 def _real_extract(self, url):
2795 mobj = re.match(self._VALID_URL, url)
2797 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2799 # extract uploader & filename from url
# NOTE(review): .decode('utf-8') on regex groups is Python-2 style and
# fails on Python 3 str (likewise the .decode calls in the result dict
# below) — consistent with _WORKING = False.
2800 uploader = mobj.group(1).decode('utf-8')
2801 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2803 # construct API request
2804 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2805 # retrieve .json file with links to files
2806 request = compat_urllib_request.Request(file_url)
2808 self.report_download_json(file_url)
2809 jsonData = compat_urllib_request.urlopen(request).read()
2810 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2811 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
2815 json_data = json.loads(jsonData)
2816 player_url = json_data['player_swf_url']
2817 formats = dict(json_data['audio_formats'])
2819 req_format = self._downloader.params.get('format', None)
2822 if self._downloader.params.get('listformats', None):
2823 self._print_formats(formats)
# 'best' (or no request): take the first format that yields a live URL.
2826 if req_format is None or req_format == 'best':
2827 for format_param in formats.keys():
2828 url_list = self.get_urls(formats, format_param)
2830 file_url = self.check_urls(url_list)
2831 if file_url is not None:
2834 if req_format not in formats:
2835 self._downloader.trouble(u'ERROR: format is not available')
2838 url_list = self.get_urls(formats, req_format)
2839 file_url = self.check_urls(url_list)
2840 format_param = req_format
2843 'id': file_id.decode('utf-8'),
2844 'url': file_url.decode('utf-8'),
2845 'uploader': uploader.decode('utf-8'),
2846 'upload_date': None,
2847 'title': json_data['name'],
2848 'ext': file_url.split('.')[-1].decode('utf-8'),
2849 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2850 'thumbnail': json_data['thumbnail_url'],
2851 'description': json_data['description'],
2852 'player_url': player_url.decode('utf-8'),
2855 class StanfordOpenClassroomIE(InfoExtractor):
2856 """Information extractor for Stanford's Open ClassRoom"""
# Three URL shapes: VideoPage (course+video), CoursePage (course only),
# and HomePage / bare root (neither) — handled by the three branches below.
2858 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2859 IE_NAME = u'stanfordoc'
2861 def report_download_webpage(self, objid):
2862 """Report information extraction."""
2863 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2865 def report_extraction(self, video_id):
2866 """Report information extraction."""
2867 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
# Dispatch on URL shape: a single video resolves its metadata XML directly;
# a course page (or the site root) is crawled into a playlist of reference
# entries which are then recursively extracted via self.extract().
2869 def _real_extract(self, url):
2870 mobj = re.match(self._VALID_URL, url)
2872 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2875 if mobj.group('course') and mobj.group('video'): # A specific video
2876 course = mobj.group('course')
2877 video = mobj.group('video')
2879 'id': course + '_' + video,
2881 'upload_date': None,
2884 self.report_extraction(info['id'])
2885 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2886 xmlUrl = baseUrl + video + '.xml'
2888 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2889 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2890 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2892 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2894 info['title'] = mdoc.findall('./title')[0].text
# Media URL = course videos base + relative path from the metadata XML.
2895 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2897 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2899 info['ext'] = info['url'].rpartition('.')[2]
2901 elif mobj.group('course'): # A course page
2902 course = mobj.group('course')
2907 'upload_date': None,
2910 self.report_download_webpage(info['id'])
2912 coursepage = compat_urllib_request.urlopen(url).read()
2913 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2914 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
2917 m = re.search('<h1>([^<]+)</h1>', coursepage)
2919 info['title'] = unescapeHTML(m.group(1))
# Fall back to the id when the page has no <h1> title.
2921 info['title'] = info['id']
2923 m = re.search('<description>([^<]+)</description>', coursepage)
2925 info['description'] = unescapeHTML(m.group(1))
# Collect each VideoPage link (deduplicated, order preserved) as a
# 'reference' entry to be extracted recursively.
2927 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2930 'type': 'reference',
2931 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2935 for entry in info['list']:
2936 assert entry['type'] == 'reference'
2937 results += self.extract(entry['url'])
# Root / HomePage: crawl every CoursePage link the same way.
2942 'id': 'Stanford OpenClassroom',
2945 'upload_date': None,
2948 self.report_download_webpage(info['id'])
2949 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2951 rootpage = compat_urllib_request.urlopen(rootURL).read()
2952 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2953 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
2956 info['title'] = info['id']
2958 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2961 'type': 'reference',
2962 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2967 for entry in info['list']:
2968 assert entry['type'] == 'reference'
2969 results += self.extract(entry['url'])
# NOTE(review): elided extract — embedded line numbers jump, so guards
# (`if mobj is None:`), `try:` headers and the final info-dict/return lines
# are missing from view. Code left byte-identical; verify against full file.
2972 class MTVIE(InfoExtractor):
2973 """Information extractor for MTV.com"""
2975 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2978 def report_extraction(self, video_id):
2979 """Report information extraction."""
2980 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2982 def _real_extract(self, url):
2983 mobj = re.match(self._VALID_URL, url)
2985 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Scheme-less URLs are normalized to http:// before download.
2987 if not mobj.group('proto'):
2988 url = 'http://' + url
2989 video_id = mobj.group('videoid')
2991 webpage = self._download_webpage(url, video_id)
# Song name / performer come from <meta> tags; .decode('iso-8859-1') on a
# str suggests Python 2-era bytes handling — TODO confirm on Python 3.
2993 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2995 self._downloader.trouble(u'ERROR: unable to extract song name')
2997 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2998 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3000 self._downloader.trouble(u'ERROR: unable to extract performer')
3002 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3003 video_title = performer + ' - ' + song_name
3005 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3007 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3009 mtvn_uri = mobj.group(1)
3011 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3013 self._downloader.trouble(u'ERROR: unable to extract content id')
3015 content_id = mobj.group(1)
# mediaGen endpoint returns an XML playlist of renditions for this video.
3017 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3018 self.report_extraction(video_id)
3019 request = compat_urllib_request.Request(videogen_url)
3021 metadataXml = compat_urllib_request.urlopen(request).read()
3022 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3023 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
3026 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3027 renditions = mdoc.findall('.//rendition')
3029 # For now, always pick the highest quality.
3030 rendition = renditions[-1]
# Extension taken from the MIME subtype, e.g. "video/mp4" -> "mp4".
3033 _,_,ext = rendition.attrib['type'].partition('/')
3034 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3035 video_url = rendition.find('./src').text
3037 self._downloader.trouble('Invalid rendition field.')
# (elided) remainder of the returned info dict
3043 'uploader': performer,
3044 'upload_date': None,
3045 'title': video_title,
# Information extractor for v.youku.com.
# NOTE(review): elided extract — embedded line numbers jump; `try:` headers,
# format-selection branches and the final return are missing from view.
3053 class YoukuIE(InfoExtractor):
3054 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3056 def report_download_webpage(self, file_id):
3057 """Report webpage download."""
3058 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))
3060 def report_extraction(self, file_id):
3061 """Report information extraction."""
3062 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
# (elided def) _gen_sid: session id = epoch-millis + two random suffixes.
3065 nowTime = int(time.time() * 1000)
3066 random1 = random.randint(1000,1998)
3067 random2 = random.randint(1000,9999)
3069 return "%d%d%d" %(nowTime,random1,random2)
3071 def _get_file_ID_mix_string(self, seed):
# Deterministic Fisher-Yates-like shuffle of the alphabet driven by `seed`,
# using a linear-congruential step (seed*211+30031 mod 65536).
3073 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3075 for i in range(len(source)):
3076 seed = (seed * 211 + 30031 ) % 65536
3077 index = math.floor(seed / 65536 * len(source) )
3078 mixed.append(source[int(index)])
3079 source.remove(source[int(index)])
3080 #return ''.join(mixed)
3083 def _get_file_id(self, fileId, seed):
# Decode the obfuscated '*'-separated id: each token indexes the mixed alphabet.
3084 mixed = self._get_file_ID_mix_string(seed)
3085 ids = fileId.split('*')
3089 realId.append(mixed[int(ch)])
3090 return ''.join(realId)
3092 def _real_extract(self, url):
3093 mobj = re.match(self._VALID_URL, url)
3095 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3097 video_id = mobj.group('ID')
# Youku playlist JSON endpoint keyed by video id.
3099 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3101 request = compat_urllib_request.Request(info_url, None, std_headers)
3103 self.report_download_webpage(video_id)
3104 jsondata = compat_urllib_request.urlopen(request).read()
3105 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3106 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3109 self.report_extraction(video_id)
3111 jsonstr = jsondata.decode('utf-8')
3112 config = json.loads(jsonstr)
3114 video_title = config['data'][0]['title']
3115 seed = config['data'][0]['seed']
# Choose a stream format: 'best' prefers hd2 when offered (worst/other
# branches elided from this view).
3117 format = self._downloader.params.get('format', None)
3118 supported_format = list(config['data'][0]['streamfileids'].keys())
3120 if format is None or format == 'best':
3121 if 'hd2' in supported_format:
3126 elif format == 'worst':
3134 fileid = config['data'][0]['streamfileids'][format]
3135 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3136 except (UnicodeDecodeError, ValueError, KeyError):
3137 self._downloader.trouble(u'ERROR: unable to extract info section')
3141 sid = self._gen_sid()
3142 fileid = self._get_file_id(fileid, seed)
3144 #column 8,9 of fileid represent the segment number
3145 #fileid[7:9] should be changed
# One download URL per segment; segment index is spliced into the fileid
# as two uppercase hex digits.
3146 for index, key in enumerate(keys):
3148 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3149 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3152 'id': '%s_part%02d' % (video_id, index),
3153 'url': download_url,
3155 'upload_date': None,
3156 'title': video_title,
3159 files_info.append(info)
# NOTE(review): elided extract — `if mobj is None:` guards, `try:` header
# and the final return are missing from view; code left byte-identical.
3164 class XNXXIE(InfoExtractor):
3165 """Information extractor for xnxx.com"""
3167 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
# Page-scraping patterns: flv URL, <title> text, and big-thumbnail URL.
3169 VIDEO_URL_RE = r'flv_url=(.*?)&'
3170 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3171 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3173 def report_webpage(self, video_id):
3174 """Report information extraction"""
3175 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3177 def report_extraction(self, video_id):
3178 """Report information extraction"""
3179 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3181 def _real_extract(self, url):
3182 mobj = re.match(self._VALID_URL, url)
3184 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3186 video_id = mobj.group(1)
3188 self.report_webpage(video_id)
3190 # Get webpage content
3192 webpage_bytes = compat_urllib_request.urlopen(url).read()
3193 webpage = webpage_bytes.decode('utf-8')
3194 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3195 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
# The flv URL is percent-encoded in the page source.
3198 result = re.search(self.VIDEO_URL_RE, webpage)
3200 self._downloader.trouble(u'ERROR: unable to extract video url')
3202 video_url = compat_urllib_parse.unquote(result.group(1))
3204 result = re.search(self.VIDEO_TITLE_RE, webpage)
3206 self._downloader.trouble(u'ERROR: unable to extract video title')
3208 video_title = result.group(1)
3210 result = re.search(self.VIDEO_THUMB_RE, webpage)
3212 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3214 video_thumbnail = result.group(1)
# (elided) start of the returned info dict
3220 'upload_date': None,
3221 'title': video_title,
3223 'thumbnail': video_thumbnail,
3224 'description': None,
# NOTE(review): elided extract — embedded line numbers jump; several guards,
# `try:` headers and the final return are missing from this view.
3228 class GooglePlusIE(InfoExtractor):
3229 """Information extractor for plus.google.com."""
3231 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3232 IE_NAME = u'plus.google'
3234 def __init__(self, downloader=None):
3235 InfoExtractor.__init__(self, downloader)
3237 def report_extract_entry(self, url):
3238 """Report downloading extry"""
3239 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3241 def report_date(self, upload_date):
3242 """Report downloading extry"""
3243 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3245 def report_uploader(self, uploader):
3246 """Report downloading extry"""
3247 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3249 def report_title(self, video_title):
3250 """Report downloading extry"""
3251 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3253 def report_extract_vid_page(self, video_page):
3254 """Report information extraction."""
3255 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3257 def _real_extract(self, url):
3258 # Extract id from URL
3259 mobj = re.match(self._VALID_URL, url)
3261 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3264 post_url = mobj.group(0)
3265 video_id = mobj.group(1)
3267 video_extension = 'flv'
3269 # Step 1, Retrieve post webpage to extract further information
3270 self.report_extract_entry(post_url)
3271 request = compat_urllib_request.Request(post_url)
3273 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3274 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3275 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3278 # Extract update date
3280 pattern = 'title="Timestamp">(.*?)</a>'
3281 mobj = re.search(pattern, webpage)
3283 upload_date = mobj.group(1)
3284 # Convert timestring to a format suitable for filename
3285 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3286 upload_date = upload_date.strftime('%Y%m%d')
3287 self.report_date(upload_date)
# (elided) uploader extraction section header
3291 pattern = r'rel\="author".*?>(.*?)</a>'
3292 mobj = re.search(pattern, webpage)
3294 uploader = mobj.group(1)
3295 self.report_uploader(uploader)
3298 # Get the first line for title
3300 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3301 mobj = re.search(pattern, webpage)
3303 video_title = mobj.group(1)
3304 self.report_title(video_title)
3306 # Step 2, Stimulate clicking the image box to launch video
3307 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3308 mobj = re.search(pattern, webpage)
3310 self._downloader.trouble(u'ERROR: unable to extract video page URL')
3312 video_page = mobj.group(1)
3313 request = compat_urllib_request.Request(video_page)
3315 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3316 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3317 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3319 self.report_extract_vid_page(video_page)
3322 # Extract video links on video page
3323 """Extract video links of all sizes"""
3324 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3325 mobj = re.findall(pattern, webpage)
3327 self._downloader.trouble(u'ERROR: unable to extract video links')
3329 # Sort in resolution
3330 links = sorted(mobj)
3332 # Choose the lowest of the sort, i.e. highest resolution
3333 video_url = links[-1]
3334 # Only get the url. The resolution part in the tuple has no use anymore
3335 video_url = video_url[-1]
3336 # Treat escaped \u0026 style hex
# Py2 str has .decode; Py3 str does not, hence the AttributeError fallback.
3338 video_url = video_url.decode("unicode_escape")
3339 except AttributeError: # Python 3
3340 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
# (elided) start of the returned info dict
3346 'uploader': uploader,
3347 'upload_date': upload_date,
3348 'title': video_title,
3349 'ext': video_extension,
# Information extractor for nba.com videos.
# NOTE(review): elided extract — guards, the `_findProp` miss-branch and the
# rest of the returned info dict are missing from this view.
3352 class NBAIE(InfoExtractor):
3353 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3356 def _real_extract(self, url):
3357 mobj = re.match(self._VALID_URL, url)
3359 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3362 video_id = mobj.group(1)
# Strip a trailing /index.html so the CDN URL below is well-formed.
3363 if video_id.endswith('/index.html'):
3364 video_id = video_id[:-len('/index.html')]
3366 webpage = self._download_webpage(url, video_id)
# Direct MP4 URL is derived from the page path, not scraped.
3368 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
# Closure helper: first regex group from `webpage`, else `default`.
3369 def _findProp(rexp, default=None):
3370 m = re.search(rexp, webpage)
3372 return unescapeHTML(m.group(1))
3376 shortened_video_id = video_id.rpartition('/')[2]
3377 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3379 'id': shortened_video_id,
# NOTE(review): 'uploader_date' looks like a typo for 'upload_date' — confirm.
3383 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3384 'description': _findProp(r'<div class="description">(.*?)</h1>'),
# NOTE(review): elided extract — `try:` headers, the per-clip info-dict opener,
# paging setup and the final return are missing from this view.
3388 class JustinTVIE(InfoExtractor):
3389 """Information extractor for justin.tv and twitch.tv"""
3390 # TODO: One broadcast may be split into multiple videos. The key
3391 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3392 # starts at 1 and increases. Can we treat all parts as one video?
3394 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3395 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
# API page size used for channel-archive pagination.
3396 _JUSTIN_PAGE_LIMIT = 100
3397 IE_NAME = u'justin.tv'
3399 def report_extraction(self, file_id):
3400 """Report information extraction."""
3401 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3403 def report_download_page(self, channel, offset):
3404 """Report attempt to download a single page of videos."""
3405 self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
3406 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3408 # Return count of items, list of *valid* items
3409 def _parse_page(self, url):
3411 urlh = compat_urllib_request.urlopen(url)
3412 webpage_bytes = urlh.read()
3413 webpage = webpage_bytes.decode('utf-8', 'ignore')
3414 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3415 self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
3418 response = json.loads(webpage)
# A non-list response is an API error object with an 'error' key.
3419 if type(response) != list:
3420 error_text = response.get('error', 'unknown error')
3421 self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
3424 for clip in response:
3425 video_url = clip['video_file_url']
3427 video_extension = os.path.splitext(video_url)[1][1:]
# 'start_time' begins with YYYY-MM-DD; dashes stripped -> YYYYMMDD.
3428 video_date = re.sub('-', '', clip['start_time'][:10])
3429 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3430 video_id = clip['id']
3431 video_title = clip.get('title', video_id)
3435 'title': video_title,
3436 'uploader': clip.get('channel_name', video_uploader_id),
3437 'uploader_id': video_uploader_id,
3438 'upload_date': video_date,
3439 'ext': video_extension,
3441 return (len(response), info)
3443 def _real_extract(self, url):
3444 mobj = re.match(self._VALID_URL, url)
3446 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3449 api = 'http://api.justin.tv'
# lastindex distinguishes channel URLs (group 1) from single broadcasts (/b/).
3450 video_id = mobj.group(mobj.lastindex)
3452 if mobj.lastindex == 1:
3454 api += '/channel/archives/%s.json'
3456 api += '/broadcast/by_archive/%s.json'
3457 api = api % (video_id,)
3459 self.report_extraction(video_id)
3463 limit = self._JUSTIN_PAGE_LIMIT
# (elided loop header) page through the API until a short page is returned.
3466 self.report_download_page(video_id, offset)
3467 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3468 page_count, page_info = self._parse_page(page_url)
3469 info.extend(page_info)
3470 if not paged or page_count != limit:
# Information extractor for funnyordie.com.
# NOTE(review): elided extract — `if m is None:` guards, the description
# fallback branch and the returned info dict opener are missing from view.
3475 class FunnyOrDieIE(InfoExtractor):
3476 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3478 def _real_extract(self, url):
3479 mobj = re.match(self._VALID_URL, url)
3481 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3484 video_id = mobj.group('id')
3485 webpage = self._download_webpage(url, video_id)
# The second <source> inside the <video> tag carries the stream URL.
3487 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3489 self._downloader.trouble(u'ERROR: unable to find video information')
3490 video_url = unescapeHTML(m.group('url'))
3492 m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
3494 self._downloader.trouble(u'Cannot find video title')
3495 title = unescapeHTML(m.group('title'))
3497 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3499 desc = unescapeHTML(m.group('desc'))
3508 'description': desc,
# Information extractor for tweetreel.com.
# NOTE(review): elided extract — miss-guards and the info dict opener/return
# are missing from this view; code left byte-identical.
3512 class TweetReelIE(InfoExtractor):
3513 _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'
3515 def _real_extract(self, url):
3516 mobj = re.match(self._VALID_URL, url)
3518 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3521 video_id = mobj.group('id')
3522 webpage = self._download_webpage(url, video_id)
# status_id links the page to the underlying tweet and names the .mov file.
3524 m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
3526 self._downloader.trouble(u'ERROR: Cannot find status ID')
3527 status_id = m.group(1)
3529 m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
3531 self._downloader.trouble(u'WARNING: Cannot find description')
# Anchor tags are stripped from the tweet text before unescaping.
3532 desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()
3534 m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
3536 self._downloader.trouble(u'ERROR: Cannot find uploader')
3537 uploader = unescapeHTML(m.group('uploader'))
3538 uploader_id = unescapeHTML(m.group('uploader_id'))
3540 m = re.search(r'<span unixtime="([0-9]+)"', webpage)
3542 self._downloader.trouble(u'ERROR: Cannot find upload date')
# Unix timestamp -> local YYYYMMDD (fromtimestamp is local-time; note for review).
3543 upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')
3546 video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'
3553 'description': desc,
3554 'uploader': uploader,
3555 'uploader_id': uploader_id,
3556 'internal_id': status_id,
3557 'upload_date': upload_date
# Information extractor for store.steampowered.com game-trailer pages.
# NOTE(review): elided extract — parts of the verbose regex (the gameID group),
# the per-video info dict and the return are missing from this view.
3561 class SteamIE(InfoExtractor):
3562 _VALID_URL = r"""http://store.steampowered.com/
3563 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3565 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3568 def suitable(self, url):
3569 """Receives a URL and returns True if suitable for this IE."""
# Overridden because _VALID_URL is written with re.VERBOSE.
3570 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
3572 def _real_extract(self, url):
3573 m = re.match(self._VALID_URL, url, re.VERBOSE)
# Matches the page's JS movie table: movie id, file URL, optional name.
3574 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3575 gameID = m.group('gameID')
3576 videourl = 'http://store.steampowered.com/video/%s/' % gameID
3577 webpage = self._download_webpage(videourl, gameID)
3578 mweb = re.finditer(urlRE, webpage)
3579 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3580 titles = re.finditer(namesRE, webpage)
# Pair each JS movie entry with its on-page <span class="title"> in order.
3582 for vid,vtitle in zip(mweb,titles):
3583 video_id = vid.group('videoID')
3584 title = vtitle.group('videoName')
3585 video_url = vid.group('videoURL')
3587 self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
3592 'title': unescapeHTML(title)
# Information extractor for ustream.tv recorded videos.
# NOTE(review): elided extract — the info dict opener and return are missing.
3597 class UstreamIE(InfoExtractor):
3598 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3599 IE_NAME = u'ustream'
3601 def _real_extract(self, url):
3602 m = re.match(self._VALID_URL, url)
3603 video_id = m.group('videoID')
# Direct CDN URL derived from the numeric id, not scraped from the page.
3604 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3605 webpage = self._download_webpage(url, video_id)
3606 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3607 title = m.group('title')
3608 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3609 uploader = m.group('uploader')
3615 'uploader': uploader
# Information extractor for rbmaradio.com shows.
# NOTE(review): elided extract — the info dict opener and return are missing.
3619 class RBMARadioIE(InfoExtractor):
3620 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3622 def _real_extract(self, url):
3623 m = re.match(self._VALID_URL, url)
3624 video_id = m.group('videoID')
3626 webpage = self._download_webpage(url, video_id)
# Show metadata is embedded as JSON assigned to gon.show in a <script> tag.
3627 m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3629 raise ExtractorError(u'Cannot find metadata')
3630 json_data = m.group(1)
3633 data = json.loads(json_data)
3634 except ValueError as e:
3635 raise ExtractorError(u'Invalid JSON: ' + str(e))
# Force 256 kbps constant bitrate on the Akamai stream URL.
3637 video_url = data['akamai_url'] + '&cbr=256'
3638 url_parts = compat_urllib_parse_urlparse(video_url)
3639 video_ext = url_parts.path.rpartition('.')[2]
3644 'title': data['title'],
3645 'description': data.get('teaser_text'),
3646 'location': data.get('country_of_origin'),
3647 'uploader': data.get('host', {}).get('name'),
3648 'uploader_id': data.get('host', {}).get('slug'),
3649 'thumbnail': data.get('image', {}).get('large_url_2x'),
3650 'duration': data.get('duration'),
# NOTE(review): elided extract — guards, the per-link loop header, the info
# dict opener and several format-selection lines are missing from this view.
3655 class YouPornIE(InfoExtractor):
3656 """Information extractor for youporn.com."""
3657 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3659 def _print_formats(self, formats):
3660 """Print all available formats"""
3661 print(u'Available formats:')
3662 print(u'ext\t\tformat')
3663 print(u'---------------------------------')
3664 for format in formats:
3665 print(u'%s\t\t%s' % (format['ext'], format['format']))
3667 def _specific(self, req_format, formats):
# (elided loop) linear search for the format dict whose "format" matches.
3669 if(x["format"]==req_format):
3673 def _real_extract(self, url):
3674 mobj = re.match(self._VALID_URL, url)
3676 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3679 video_id = mobj.group('videoid')
# Age gate is bypassed with a pre-set cookie.
3681 req = compat_urllib_request.Request(url)
3682 req.add_header('Cookie', 'age_verified=1')
3683 webpage = self._download_webpage(req, video_id)
3685 # Get the video title
3686 result = re.search(r'videoTitleArea">(?P<title>.*)</h1>', webpage)
3688 raise ExtractorError(u'ERROR: unable to extract video title')
3689 video_title = result.group('title').strip()
3691 # Get the video date
3692 result = re.search(r'Date:</b>(?P<date>.*)</li>', webpage)
3694 self._downloader.to_stderr(u'WARNING: unable to extract video date')
3697 upload_date = result.group('date').strip()
3699 # Get the video uploader
3700 result = re.search(r'Submitted:</b>(?P<uploader>.*)</li>', webpage)
3702 self._downloader.to_stderr(u'ERROR: unable to extract uploader')
3703 video_uploader = None
3705 video_uploader = result.group('uploader').strip()
3706 video_uploader = clean_html( video_uploader )
3708 # Get all of the formats available
3709 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3710 result = re.search(DOWNLOAD_LIST_RE, webpage)
3712 raise ExtractorError(u'Unable to extract download list')
3713 download_list_html = result.group('download_list').strip()
3715 # Get all of the links from the page
3716 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3717 links = re.findall(LINK_RE, download_list_html)
3718 if(len(links) == 0):
3719 raise ExtractorError(u'ERROR: no known formats available for video')
3721 self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))
3726 # A link looks like this:
3727 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3728 # A path looks like this:
3729 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3730 video_url = unescapeHTML( link )
3731 path = compat_urllib_parse_urlparse( video_url ).path
3732 extension = os.path.splitext( path )[1][1:]
# Path segment 4 encodes "<size>_<bitrate>_<id>"; first two parts kept.
3733 format = path.split('/')[4].split('_')[:2]
3736 format = "-".join( format )
3737 title = u'%s-%s-%s' % (video_title, size, bitrate)
3742 'uploader': video_uploader,
3743 'upload_date': upload_date,
3748 'description': None,
# Honor --list-formats, then --format selection (best/worst/all/specific).
3752 if self._downloader.params.get('listformats', None):
3753 self._print_formats(formats)
3756 req_format = self._downloader.params.get('format', None)
3757 self._downloader.to_screen(u'[youporn] Format: %s' % req_format)
3759 if req_format is None or req_format == 'best':
3761 elif req_format == 'worst':
3762 return [formats[-1]]
3763 elif req_format in ('-1', 'all'):
3766 format = self._specific( req_format, formats )
3768 self._downloader.trouble(u'ERROR: requested format not available')
# NOTE(review): elided extract — miss-guards and the middle of the info dict
# are missing from this view; code left byte-identical.
3774 class PornotubeIE(InfoExtractor):
3775 """Information extractor for pornotube.com."""
3776 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3778 def _real_extract(self, url):
3779 mobj = re.match(self._VALID_URL, url)
3781 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Title comes straight from the URL path, not the page.
3784 video_id = mobj.group('videoid')
3785 video_title = mobj.group('title')
3787 # Get webpage content
3788 webpage = self._download_webpage(url, video_id)
3791 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3792 result = re.search(VIDEO_URL_RE, webpage)
3794 self._downloader.trouble(u'ERROR: unable to extract video url')
3796 video_url = compat_urllib_parse.unquote(result.group('url'))
3798 #Get the uploaded date
3799 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3800 result = re.search(VIDEO_UPLOADED_RE, webpage)
# NOTE(review): error text says "video title" but this checks the date regex.
3802 self._downloader.trouble(u'ERROR: unable to extract video title')
3804 upload_date = result.group('date')
3806 info = {'id': video_id,
3809 'upload_date': upload_date,
3810 'title': video_title,
# NOTE(review): elided extract — miss-guards and parts of the info dict are
# missing from this view; code left byte-identical.
3816 class YouJizzIE(InfoExtractor):
3817 """Information extractor for youjizz.com."""
3818 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3820 def _real_extract(self, url):
3821 mobj = re.match(self._VALID_URL, url)
3823 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3826 video_id = mobj.group('videoid')
3828 # Get webpage content
3829 webpage = self._download_webpage(url, video_id)
3831 # Get the video title
3832 result = re.search(r'<title>(?P<title>.*)</title>', webpage)
3834 raise ExtractorError(u'ERROR: unable to extract video title')
3835 video_title = result.group('title').strip()
3837 # Get the embed page
# The embed page URL also supplies the canonical numeric video id.
3838 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3840 raise ExtractorError(u'ERROR: unable to extract embed page')
3842 embed_page_url = result.group(0).strip()
3843 video_id = result.group('videoid')
3845 webpage = self._download_webpage(embed_page_url, video_id)
# Stream URL is passed to the Flash player via so.addVariable("file", ...).
3848 result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
3850 raise ExtractorError(u'ERROR: unable to extract video url')
3851 video_url = result.group('source')
3853 info = {'id': video_id,
3855 'title': video_title,
3858 'player_url': embed_page_url}
# Information extractor for 8tracks.com mixes (one entry per track).
# NOTE(review): elided extract — mix_id assignment, the per-track info opener
# and the final return are missing from this view.
3862 class EightTracksIE(InfoExtractor):
3864 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/]+)'
3866 def _real_extract(self, url):
3867 mobj = re.match(self._VALID_URL, url)
3869 raise ExtractorError(u'Invalid URL: %s' % url)
3870 playlist_id = mobj.group('id')
3872 webpage = self._download_webpage(url, playlist_id)
# Mix metadata is a JS object literal passed to new TRAX.Mix(...).
3874 m = re.search(r"new TRAX.Mix\((.*?)\);\n*\s*TRAX.initSearchAutocomplete\('#search'\);", webpage, flags=re.DOTALL)
3876 raise ExtractorError(u'Cannot find trax information')
3877 json_like = m.group(1)
3878 data = json.loads(json_like)
# Random session id required by the play/next API endpoints.
3880 session = str(random.randint(0, 1000000000))
3882 track_count = data['tracks_count']
3883 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3884 next_url = first_url
# Walk the playlist one track at a time until at_last_track.
3886 for i in itertools.count():
3887 api_json = self._download_webpage(next_url, playlist_id,
3888 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3889 errnote=u'Failed to download song information')
3890 api_data = json.loads(api_json)
3891 track_data = api_data[u'set']['track']
3893 'id': track_data['id'],
3894 'url': track_data['track_file_stream_url'],
3895 'title': track_data['name'],
3896 'uploader': track_data['performer'],
3900 if api_data['set']['at_last_track']:
3902 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
3905 def gen_extractors():
3906 """ Return a list of an instance of every supported extractor.
3907 The order does matter; the first extractor matched is the one handling the URL.
3910 YoutubePlaylistIE(),
3934 StanfordOpenClassroomIE(),