2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
14 import xml.etree.ElementTree
21 class InfoExtractor(object):
22 """Information Extractor class.
24 Information extractors are the classes that, given a URL, extract
25 information about the video (or videos) the URL refers to. This
26 information includes the real video URL, the video title, author and
27 others. The information is stored in a dictionary which is then
28 passed to the FileDownloader. The FileDownloader processes this
29 information possibly downloading the video to the file system, among
30 other possible outcomes.
32 The dictionaries must include the following fields:
36 title: Video title, unescaped.
37 ext: Video filename extension.
39 The following fields are optional:
41 format: The video format, defaults to ext (used for --get-format)
42 thumbnail: Full URL to a video thumbnail image.
43 description: One-line video description.
44 uploader: Full name of the video uploader.
45 upload_date: Video upload date (YYYYMMDD).
46 uploader_id: Nickname or id of the video uploader.
47 location: Physical location of the video.
48 player_url: SWF Player URL (used for rtmpdump).
49 subtitles: The .srt file contents.
50 urlhandle: [internal] The urlHandle to be used to download the file,
51 like returned by urllib.request.urlopen
53 The fields should all be Unicode strings.
55 Subclasses of this one should re-define the _real_initialize() and
56 _real_extract() methods and define a _VALID_URL regexp.
57 Probably, they should also be added to the list of extractors.
59 _real_extract() must return a *list* of information dictionaries as
62 Finally, the _WORKING attribute should be set to False for broken IEs
63 in order to warn the users and skip the tests.
70 def __init__(self, downloader=None):
71 """Constructor. Receives an optional downloader."""
73 self.set_downloader(downloader)
75 def suitable(self, url):
76 """Receives a URL and returns True if suitable for this IE."""
77 return re.match(self._VALID_URL, url) is not None
80 """Getter method for _WORKING."""
84 """Initializes an instance (authentication, etc)."""
86 self._real_initialize()
89 def extract(self, url):
90 """Extracts URL information and returns it in list of dicts."""
92 return self._real_extract(url)
    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        # The downloader is used throughout the extractors for user-visible
        # output (self._downloader.to_screen) and option lookup
        # (self._downloader.params); it may be None until one is attached.
        self._downloader = downloader
98 def _real_initialize(self):
99 """Real initialization process. Redefine in subclasses."""
102 def _real_extract(self, url):
103 """Real extraction process. Redefine in subclasses."""
108 return type(self).__name__[:-2]
    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        # NOTE(review): interior lines are elided in this view (the
        # `if note is None:` guard and the `try:` opener are not visible);
        # comments below describe only the visible statements.
        note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        # Open the URL (or prepared Request) and hand back the live handle.
        return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            errnote = u'Unable to download webpage'
            # Re-raise as ExtractorError, keeping the original traceback object.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
122 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
123 """ Returns the data of the page as a string """
124 urlh = self._request_webpage(url_or_request, video_id, note, errnote)
125 webpage_bytes = urlh.read()
126 return webpage_bytes.decode('utf-8', 'replace')
129 class YoutubeIE(InfoExtractor):
130 """Information extractor for youtube.com."""
134 (?:https?://)? # http(s):// (optional)
135 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
136 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
137 (?:.*?\#/)? # handle anchor (#/) redirect urls
138 (?!view_play_list|my_playlists|artist|playlist) # ignore playlist URLs
139 (?: # the various things that can precede the ID:
140 (?:(?:v|embed|e)/) # v/ or embed/ or e/
141 |(?: # or the v= param in all its forms
142 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
143 (?:\?|\#!?) # the params delimiter ? or # or #!
144 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
147 )? # optional -> youtube.com/xxxx is OK
148 )? # all until now is optional -> you can pass the naked ID
149 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
150 (?(1).+)? # if we found the ID, everything can follow
152 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
153 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
154 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
155 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
156 _NETRC_MACHINE = 'youtube'
157 # Listed in order of quality
158 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
159 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
160 _video_extensions = {
166 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
172 _video_dimensions = {
190 def suitable(self, url):
191 """Receives a URL and returns True if suitable for this IE."""
192 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
194 def report_lang(self):
195 """Report attempt to set language."""
196 self._downloader.to_screen(u'[youtube] Setting language')
198 def report_login(self):
199 """Report attempt to log in."""
200 self._downloader.to_screen(u'[youtube] Logging in')
202 def report_age_confirmation(self):
203 """Report attempt to confirm age."""
204 self._downloader.to_screen(u'[youtube] Confirming age')
206 def report_video_webpage_download(self, video_id):
207 """Report attempt to download video webpage."""
208 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
210 def report_video_info_webpage_download(self, video_id):
211 """Report attempt to download video info webpage."""
212 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        # (Docstring fixed: it was copy-pasted from the info-webpage method.)
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
218 def report_information_extraction(self, video_id):
219 """Report attempt to extract video information."""
220 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        # (Docstring fixed: it previously said "Report extracted video URL.")
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
226 def report_rtmp_download(self):
227 """Indicate the download will use the RTMP protocol."""
228 self._downloader.to_screen(u'[youtube] RTMP download detected')
    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube timedtext XML into SRT-formatted subtitle text.

        NOTE(review): several lines are elided in this view (the `srt`
        accumulator initialisation, the float conversion of `start`, and
        the final return are not visible); comments cover only what is shown.
        """
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # default duration when the tag omits one
            end = start + float(dur)
            # Render both timestamps as HH:MM:SS,mmm (the SRT timestamp form).
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
    def _extract_subtitles(self, video_id):
        """Fetch closed captions for *video_id* and convert them to SRT.

        Returns a pair: (warning_message, None) on failure, or
        (None, srt_text) on success.

        NOTE(review): the two `try:` openers, the assignment in the 'en'
        branch, and an `if` guard before the final fallback return are
        elided in this view; comments cover only the visible statements.
        """
        self.report_video_subtitles_download(video_id)
        # First request: list the caption tracks available for this video.
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
            srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        # Build a language_code -> track_name map from the track list.
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        # Choose the language: user-requested first, then English, then any.
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            srt_lang = list(srt_lang_list.keys())[0]
        if not srt_lang in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        # Second request: fetch the chosen caption track's XML.
        request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
            srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
            return (u'WARNING: unable to download video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))
    def _print_formats(self, formats):
        """Print each available format with its extension and dimensions."""
        print('Available formats:')
        # NOTE(review): the loop header is elided in this view; `x` is
        # presumably each itag string from *formats* — confirm upstream.
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
280 def _real_initialize(self):
281 if self._downloader is None:
286 downloader_params = self._downloader.params
288 # Attempt to use provided username and password or .netrc data
289 if downloader_params.get('username', None) is not None:
290 username = downloader_params['username']
291 password = downloader_params['password']
292 elif downloader_params.get('usenetrc', False):
294 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
299 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
300 except (IOError, netrc.NetrcParseError) as err:
301 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
305 request = compat_urllib_request.Request(self._LANG_URL)
308 compat_urllib_request.urlopen(request).read()
309 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
310 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
313 # No authentication to be performed
319 'current_form': 'loginForm',
321 'action_login': 'Log In',
322 'username': username,
323 'password': password,
325 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
328 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
329 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
330 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
332 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
333 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
339 'action_confirm': 'Confirm',
341 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
343 self.report_age_confirmation()
344 age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
345 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
346 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
    def _extract_id(self, url):
        """Extract and return the YouTube video id from *url*.

        NOTE(review): the `if mobj is None:` guard and the final return
        are elided in this view.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        # The video id is taken from capture group 2 of _VALID_URL.
        video_id = mobj.group(2)
357 def _real_extract(self, url):
358 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
359 mobj = re.search(self._NEXT_URL_RE, url)
361 url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
362 video_id = self._extract_id(url)
365 self.report_video_webpage_download(video_id)
366 url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
367 request = compat_urllib_request.Request(url)
369 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
370 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
371 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
374 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
376 # Attempt to extract SWF player URL
377 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
379 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
384 self.report_video_info_webpage_download(video_id)
385 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
386 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
387 % (video_id, el_type))
388 request = compat_urllib_request.Request(video_info_url)
390 video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
391 video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
392 video_info = compat_parse_qs(video_info_webpage)
393 if 'token' in video_info:
395 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
396 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
398 if 'token' not in video_info:
399 if 'reason' in video_info:
400 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
402 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
405 # Check for "rental" videos
406 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
407 self._downloader.trouble(u'ERROR: "rental" videos not supported')
410 # Start extracting information
411 self.report_information_extraction(video_id)
414 if 'author' not in video_info:
415 self._downloader.trouble(u'ERROR: unable to extract uploader name')
417 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
420 video_uploader_id = None
421 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
423 video_uploader_id = mobj.group(1)
425 self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
428 if 'title' not in video_info:
429 self._downloader.trouble(u'ERROR: unable to extract video title')
431 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
434 if 'thumbnail_url' not in video_info:
435 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
437 else: # don't panic if we can't find it
438 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
442 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
444 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
445 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
446 for expression in format_expressions:
448 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
453 video_description = get_element_by_id("eow-description", video_webpage)
454 if video_description:
455 video_description = clean_html(video_description)
457 video_description = ''
460 video_subtitles = None
461 if self._downloader.params.get('writesubtitles', False):
462 (srt_error, video_subtitles) = self._extract_subtitles(video_id)
464 self._downloader.trouble(srt_error)
466 if 'length_seconds' not in video_info:
467 self._downloader.trouble(u'WARNING: unable to extract video duration')
470 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
473 video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
475 # Decide which formats to download
476 req_format = self._downloader.params.get('format', None)
478 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
479 self.report_rtmp_download()
480 video_url_list = [(None, video_info['conn'][0])]
481 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
482 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
483 url_data = [compat_parse_qs(uds) for uds in url_data_strs]
484 url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
485 url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
487 format_limit = self._downloader.params.get('format_limit', None)
488 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
489 if format_limit is not None and format_limit in available_formats:
490 format_list = available_formats[available_formats.index(format_limit):]
492 format_list = available_formats
493 existing_formats = [x for x in format_list if x in url_map]
494 if len(existing_formats) == 0:
495 self._downloader.trouble(u'ERROR: no known formats available for video')
497 if self._downloader.params.get('listformats', None):
498 self._print_formats(existing_formats)
500 if req_format is None or req_format == 'best':
501 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
502 elif req_format == 'worst':
503 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
504 elif req_format in ('-1', 'all'):
505 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
507 # Specific formats. We pick the first in a slash-delimeted sequence.
508 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
509 req_formats = req_format.split('/')
510 video_url_list = None
511 for rf in req_formats:
513 video_url_list = [(rf, url_map[rf])]
515 if video_url_list is None:
516 self._downloader.trouble(u'ERROR: requested format not available')
519 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
523 for format_param, video_real_url in video_url_list:
525 video_extension = self._video_extensions.get(format_param, 'flv')
527 video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
528 self._video_dimensions.get(format_param, '???'))
532 'url': video_real_url,
533 'uploader': video_uploader,
534 'uploader_id': video_uploader_id,
535 'upload_date': upload_date,
536 'title': video_title,
537 'ext': video_extension,
538 'format': video_format,
539 'thumbnail': video_thumbnail,
540 'description': video_description,
541 'player_url': player_url,
542 'subtitles': video_subtitles,
543 'duration': video_duration
548 class MetacafeIE(InfoExtractor):
549 """Information Extractor for metacafe.com."""
551 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
552 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
553 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
554 IE_NAME = u'metacafe'
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)
559 def report_disclaimer(self):
560 """Report disclaimer retrieval."""
561 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
563 def report_age_confirmation(self):
564 """Report attempt to confirm age."""
565 self._downloader.to_screen(u'[metacafe] Confirming age')
567 def report_download_webpage(self, video_id):
568 """Report webpage download."""
569 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
571 def report_extraction(self, video_id):
572 """Report information extraction."""
573 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
575 def _real_initialize(self):
576 # Retrieve disclaimer
577 request = compat_urllib_request.Request(self._DISCLAIMER)
579 self.report_disclaimer()
580 disclaimer = compat_urllib_request.urlopen(request).read()
581 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
582 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
588 'submit': "Continue - I'm over 18",
590 request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
592 self.report_age_confirmation()
593 disclaimer = compat_urllib_request.urlopen(request).read()
594 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
595 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
598 def _real_extract(self, url):
599 # Extract id and simplified title from URL
600 mobj = re.match(self._VALID_URL, url)
602 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
605 video_id = mobj.group(1)
607 # Check if video comes from YouTube
608 mobj2 = re.match(r'^yt-(.*)$', video_id)
609 if mobj2 is not None:
610 self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
613 # Retrieve video webpage to extract further information
614 request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
616 self.report_download_webpage(video_id)
617 webpage = compat_urllib_request.urlopen(request).read()
618 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
619 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
622 # Extract URL, uploader and title from webpage
623 self.report_extraction(video_id)
624 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
626 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
627 video_extension = mediaURL[-3:]
629 # Extract gdaKey if available
630 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
634 gdaKey = mobj.group(1)
635 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
637 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
639 self._downloader.trouble(u'ERROR: unable to extract media URL')
641 vardict = compat_parse_qs(mobj.group(1))
642 if 'mediaData' not in vardict:
643 self._downloader.trouble(u'ERROR: unable to extract media URL')
645 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
647 self._downloader.trouble(u'ERROR: unable to extract media URL')
649 mediaURL = mobj.group(1).replace('\\/', '/')
650 video_extension = mediaURL[-3:]
651 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
653 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
655 self._downloader.trouble(u'ERROR: unable to extract title')
657 video_title = mobj.group(1).decode('utf-8')
659 mobj = re.search(r'submitter=(.*?);', webpage)
661 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
663 video_uploader = mobj.group(1)
666 'id': video_id.decode('utf-8'),
667 'url': video_url.decode('utf-8'),
668 'uploader': video_uploader.decode('utf-8'),
670 'title': video_title,
671 'ext': video_extension.decode('utf-8'),
675 class DailymotionIE(InfoExtractor):
676 """Information Extractor for Dailymotion"""
678 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
679 IE_NAME = u'dailymotion'
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)
684 def report_extraction(self, video_id):
685 """Report information extraction."""
686 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
688 def _real_extract(self, url):
689 # Extract id and simplified title from URL
690 mobj = re.match(self._VALID_URL, url)
692 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
695 video_id = mobj.group(1).split('_')[0].split('?')[0]
697 video_extension = 'mp4'
699 # Retrieve video webpage to extract further information
700 request = compat_urllib_request.Request(url)
701 request.add_header('Cookie', 'family_filter=off')
702 webpage = self._download_webpage(request, video_id)
704 # Extract URL, uploader and title from webpage
705 self.report_extraction(video_id)
706 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
708 self._downloader.trouble(u'ERROR: unable to extract media URL')
710 flashvars = compat_urllib_parse.unquote(mobj.group(1))
712 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
715 self._downloader.to_screen(u'[dailymotion] Using %s' % key)
718 self._downloader.trouble(u'ERROR: unable to extract video URL')
721 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
723 self._downloader.trouble(u'ERROR: unable to extract video URL')
726 video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
728 # TODO: support choosing qualities
730 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
732 self._downloader.trouble(u'ERROR: unable to extract title')
734 video_title = unescapeHTML(mobj.group('title'))
736 video_uploader = None
737 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
739 # lookin for official user
740 mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
741 if mobj_official is None:
742 self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
744 video_uploader = mobj_official.group(1)
746 video_uploader = mobj.group(1)
748 video_upload_date = None
749 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
751 video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
756 'uploader': video_uploader,
757 'upload_date': video_upload_date,
758 'title': video_title,
759 'ext': video_extension,
763 class PhotobucketIE(InfoExtractor):
764 """Information extractor for photobucket.com."""
766 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
767 IE_NAME = u'photobucket'
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)
772 def report_download_webpage(self, video_id):
773 """Report webpage download."""
774 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
776 def report_extraction(self, video_id):
777 """Report information extraction."""
778 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
780 def _real_extract(self, url):
781 # Extract id from URL
782 mobj = re.match(self._VALID_URL, url)
784 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
787 video_id = mobj.group(1)
789 video_extension = 'flv'
791 # Retrieve video webpage to extract further information
792 request = compat_urllib_request.Request(url)
794 self.report_download_webpage(video_id)
795 webpage = compat_urllib_request.urlopen(request).read()
796 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
797 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
800 # Extract URL, uploader, and title from webpage
801 self.report_extraction(video_id)
802 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
804 self._downloader.trouble(u'ERROR: unable to extract media URL')
806 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
810 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
812 self._downloader.trouble(u'ERROR: unable to extract title')
814 video_title = mobj.group(1).decode('utf-8')
816 video_uploader = mobj.group(2).decode('utf-8')
819 'id': video_id.decode('utf-8'),
820 'url': video_url.decode('utf-8'),
821 'uploader': video_uploader,
823 'title': video_title,
824 'ext': video_extension.decode('utf-8'),
828 class YahooIE(InfoExtractor):
829 """Information extractor for video.yahoo.com."""
832 # _VALID_URL matches all Yahoo! Video URLs
833 # _VPAGE_URL matches only the extractable '/watch/' URLs
834 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
835 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
836 IE_NAME = u'video.yahoo'
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)
841 def report_download_webpage(self, video_id):
842 """Report webpage download."""
843 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
845 def report_extraction(self, video_id):
846 """Report information extraction."""
847 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
849 def _real_extract(self, url, new_video=True):
850 # Extract ID from URL
851 mobj = re.match(self._VALID_URL, url)
853 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
856 video_id = mobj.group(2)
857 video_extension = 'flv'
859 # Rewrite valid but non-extractable URLs as
860 # extractable English language /watch/ URLs
861 if re.match(self._VPAGE_URL, url) is None:
862 request = compat_urllib_request.Request(url)
864 webpage = compat_urllib_request.urlopen(request).read()
865 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
866 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
869 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
871 self._downloader.trouble(u'ERROR: Unable to extract id field')
873 yahoo_id = mobj.group(1)
875 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
877 self._downloader.trouble(u'ERROR: Unable to extract vid field')
879 yahoo_vid = mobj.group(1)
881 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
882 return self._real_extract(url, new_video=False)
884 # Retrieve video webpage to extract further information
885 request = compat_urllib_request.Request(url)
887 self.report_download_webpage(video_id)
888 webpage = compat_urllib_request.urlopen(request).read()
889 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
890 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
893 # Extract uploader and title from webpage
894 self.report_extraction(video_id)
895 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
897 self._downloader.trouble(u'ERROR: unable to extract video title')
899 video_title = mobj.group(1).decode('utf-8')
901 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
903 self._downloader.trouble(u'ERROR: unable to extract video uploader')
905 video_uploader = mobj.group(1).decode('utf-8')
907 # Extract video thumbnail
908 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
910 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
912 video_thumbnail = mobj.group(1).decode('utf-8')
914 # Extract video description
915 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
917 self._downloader.trouble(u'ERROR: unable to extract video description')
919 video_description = mobj.group(1).decode('utf-8')
920 if not video_description:
921 video_description = 'No description available.'
923 # Extract video height and width
924 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
926 self._downloader.trouble(u'ERROR: unable to extract video height')
928 yv_video_height = mobj.group(1)
930 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
932 self._downloader.trouble(u'ERROR: unable to extract video width')
934 yv_video_width = mobj.group(1)
936 # Retrieve video playlist to extract media URL
937 # I'm not completely sure what all these options are, but we
938 # seem to need most of them, otherwise the server sends a 401.
939 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
940 yv_bitrate = '700' # according to Wikipedia this is hard-coded
941 request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
942 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
943 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
945 self.report_download_webpage(video_id)
946 webpage = compat_urllib_request.urlopen(request).read()
947 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
948 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
951 # Extract media URL from playlist XML
952 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
954 self._downloader.trouble(u'ERROR: Unable to extract media URL')
956 video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
957 video_url = unescapeHTML(video_url)
960 'id': video_id.decode('utf-8'),
962 'uploader': video_uploader,
964 'title': video_title,
965 'ext': video_extension.decode('utf-8'),
966 'thumbnail': video_thumbnail.decode('utf-8'),
967 'description': video_description,
971 class VimeoIE(InfoExtractor):
972 """Information extractor for vimeo.com."""
# NOTE(review): this chunk is a lossy extract -- the embedded original line
# numbers skip values (e.g. 992-995), so interleaved `if mobj is None:`
# guards, `try:` openers and `return` statements are not visible here.
# Recover the full text from version control before behavioural edits.
974 # _VALID_URL matches Vimeo URLs
# NOTE(review): the dot after (?:www|player) is unescaped, so it matches ANY
# single character rather than a literal '.' -- presumably intended as `\.`.
975 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
978 def __init__(self, downloader=None):
979 InfoExtractor.__init__(self, downloader)
981 def report_download_webpage(self, video_id):
982 """Report webpage download."""
983 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
985 def report_extraction(self, video_id):
986 """Report information extraction."""
987 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
# Fetches the watch page, parses the embedded `{config: ...}` JSON blob and
# derives url/title/uploader/date plus a codec+quality choice from it.
989 def _real_extract(self, url, new_video=True):
990 # Extract ID from URL
991 mobj = re.match(self._VALID_URL, url)
993 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
996 video_id = mobj.group(1)
998 # Retrieve video webpage to extract further information
999 request = compat_urllib_request.Request(url, None, std_headers)
1001 self.report_download_webpage(video_id)
1002 webpage_bytes = compat_urllib_request.urlopen(request).read()
1003 webpage = webpage_bytes.decode('utf-8')
1004 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1005 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1008 # Now we begin extracting as much information as we can from what we
1009 # retrieved. First we extract the information common to all extractors,
1010 # and latter we extract those that are Vimeo specific.
1011 self.report_extraction(video_id)
1013 # Extract the config JSON
# NOTE(review): brittle -- depends on the literal markers ' = {config:' and
# ',assets:' appearing verbatim in the page source.
1015 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1016 config = json.loads(config)
1018 self._downloader.trouble(u'ERROR: unable to extract info section')
1022 video_title = config["video"]["title"]
1024 # Extract uploader and uploader_id
1025 video_uploader = config["video"]["owner"]["name"]
1026 video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]
1028 # Extract video thumbnail
1029 video_thumbnail = config["video"]["thumbnail"]
1031 # Extract video description
1032 video_description = get_element_by_attribute("itemprop", "description", webpage)
1033 if video_description: video_description = clean_html(video_description)
1034 else: video_description = ''
1036 # Extract upload date
1037 video_upload_date = None
1038 mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1039 if mobj is not None:
1040 video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1042 # Vimeo specific: extract request signature and timestamp
1043 sig = config['request']['signature']
1044 timestamp = config['request']['timestamp']
1046 # Vimeo specific: extract video codec and quality information
1047 # First consider quality, then codecs, then take everything
1048 # TODO bind to format param
1049 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1050 files = { 'hd': [], 'sd': [], 'other': []}
1051 for codec_name, codec_extension in codecs:
1052 if codec_name in config["video"]["files"]:
1053 if 'hd' in config["video"]["files"][codec_name]:
1054 files['hd'].append((codec_name, codec_extension, 'hd'))
1055 elif 'sd' in config["video"]["files"][codec_name]:
1056 files['sd'].append((codec_name, codec_extension, 'sd'))
1058 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
# Pick the first populated quality bucket in fixed preference order hd>sd>other.
1060 for quality in ('hd', 'sd', 'other'):
1061 if len(files[quality]) > 0:
1062 video_quality = files[quality][0][2]
1063 video_codec = files[quality][0][0]
1064 video_extension = files[quality][0][1]
1065 self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1068 self._downloader.trouble(u'ERROR: no known codec found')
1071 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1072 %(video_id, sig, timestamp, video_quality, video_codec.upper())
# Returned info dictionary follows (the dict opener around original line
# 1075 is not visible in this extract).
1077 'uploader': video_uploader,
1078 'uploader_id': video_uploader_id,
1079 'upload_date': video_upload_date,
1080 'title': video_title,
1081 'ext': video_extension,
1082 'thumbnail': video_thumbnail,
1083 'description': video_description,
1087 class ArteTvIE(InfoExtractor):
1088 """arte.tv information extractor."""
# NOTE(review): lossy extract -- embedded original line numbers skip values,
# so `try:` openers, `if mobj is None:` guards and `return` statements are
# missing from this view.  Recover full text from VCS before editing.
1090 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1091 _LIVE_URL = r'index-[0-9]+\.html$'
1093 IE_NAME = u'arte.tv'
1095 def __init__(self, downloader=None):
1096 InfoExtractor.__init__(self, downloader)
1098 def report_download_webpage(self, video_id):
1099 """Report webpage download."""
1100 self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)
1102 def report_extraction(self, video_id):
1103 """Report information extraction."""
1104 self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)
# Downloads `url` and returns the page body; errors are routed through
# self._downloader.trouble (the `try:` line is not visible in this extract).
1106 def fetch_webpage(self, url):
1107 request = compat_urllib_request.Request(url)
1109 self.report_download_webpage(url)
1110 webpage = compat_urllib_request.urlopen(request).read()
1111 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1112 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1114 except ValueError as err:
1115 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
# Fetches `url`, applies `regex` with `regexFlags`, and builds a dict from
# `matchTuples` entries (group index, dict key, error message on miss).
1119 def grep_webpage(self, url, regex, regexFlags, matchTuples):
1120 page = self.fetch_webpage(url)
1121 mobj = re.search(regex, page, regexFlags)
1125 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1128 for (i, key, err) in matchTuples:
1129 if mobj.group(i) is None:
1130 self._downloader.trouble(err)
1133 info[key] = mobj.group(i)
# Live-stream path: locate the videothek JS, then the SWF player/stream path.
1137 def extractLiveStream(self, url):
1138 video_lang = url.split('/')[-4]
1139 info = self.grep_webpage(
1141 r'src="(.*?/videothek_js.*?\.js)',
1144 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1147 http_host = url.split('/')[2]
1148 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1149 info = self.grep_webpage(
# NOTE(review): the continuation fragments below are NON-raw strings
# containing `\.` -- an invalid escape sequence in Python 3 (DeprecationWarning,
# later SyntaxWarning/SyntaxError); they should be raw `r'...'` literals.
1151 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1152 '(http://.*?\.swf).*?' +
1156 (1, 'path', u'ERROR: could not extract video path: %s' % url),
1157 (2, 'player', u'ERROR: could not extract video player: %s' % url),
1158 (3, 'url', u'ERROR: could not extract video url: %s' % url)
1161 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
# "Plus 7" catch-up path: follow two levels of videoref indirection, then
# read id/name/date/hd-url from the final XML document.
1163 def extractPlus7Stream(self, url):
1164 video_lang = url.split('/')[-3]
1165 info = self.grep_webpage(
1167 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1170 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1173 next_url = compat_urllib_parse.unquote(info.get('url'))
1174 info = self.grep_webpage(
1176 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1179 (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
1182 next_url = compat_urllib_parse.unquote(info.get('url'))
1184 info = self.grep_webpage(
1186 r'<video id="(.*?)".*?>.*?' +
1187 '<name>(.*?)</name>.*?' +
1188 '<dateVideo>(.*?)</dateVideo>.*?' +
1189 '<url quality="hd">(.*?)</url>',
1192 (1, 'id', u'ERROR: could not extract video id: %s' % url),
1193 (2, 'title', u'ERROR: could not extract video title: %s' % url),
1194 (3, 'date', u'ERROR: could not extract video date: %s' % url),
1195 (4, 'url', u'ERROR: could not extract video url: %s' % url)
# NOTE(review): 'upload_date' below is the raw <dateVideo> text; the class
# docstring contract says YYYYMMDD -- confirm the site format matches.
1200 'id': info.get('id'),
1201 'url': compat_urllib_parse.unquote(info.get('url')),
1202 'uploader': u'arte.tv',
1203 'upload_date': info.get('date'),
1204 'title': info.get('title').decode('utf-8'),
# Dispatch: live URLs (index-NN.html) vs regular "Plus 7" video pages.
1210 def _real_extract(self, url):
1211 video_id = url.split('/')[-1]
1212 self.report_extraction(video_id)
1214 if re.search(self._LIVE_URL, video_id) is not None:
1215 self.extractLiveStream(url)
1218 info = self.extractPlus7Stream(url)
1223 class GenericIE(InfoExtractor):
1224 """Generic last-resort information extractor."""
# NOTE(review): lossy extract -- embedded original line numbers skip values,
# so `if mobj is None:` guards, `try:` openers, `return` statements and some
# method bodies (e.g. get_method's `return "HEAD"`) are not visible here.
1227 IE_NAME = u'generic'
1229 def __init__(self, downloader=None):
1230 InfoExtractor.__init__(self, downloader)
1232 def report_download_webpage(self, video_id):
1233 """Report webpage download."""
1234 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1235 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1237 def report_extraction(self, video_id):
1238 """Report information extraction."""
1239 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1241 def report_following_redirect(self, new_url):
1242 """Report information extraction."""
1243 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
# Issues a HEAD request (falling back to GET on 405) to detect URL-shortener
# redirects; if the final URL differs, re-dispatches it through the downloader.
1245 def _test_redirect(self, url):
1246 """Check if it is a redirect, like url shorteners, in case restart chain."""
1247 class HeadRequest(compat_urllib_request.Request):
1248 def get_method(self):
1251 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1253 Subclass the HTTPRedirectHandler to make it use our
1254 HeadRequest also on the redirected URL
1256 def redirect_request(self, req, fp, code, msg, headers, newurl):
1257 if code in (301, 302, 303, 307):
1258 newurl = newurl.replace(' ', '%20')
# Strip entity headers: the redirected HEAD request carries no body.
1259 newheaders = dict((k,v) for k,v in req.headers.items()
1260 if k.lower() not in ("content-length", "content-type"))
1261 return HeadRequest(newurl,
1263 origin_req_host=req.get_origin_req_host(),
1266 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1268 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1270 Fallback to GET if HEAD is not allowed (405 HTTP error)
1272 def http_error_405(self, req, fp, code, msg, headers):
1276 newheaders = dict((k,v) for k,v in req.headers.items()
1277 if k.lower() not in ("content-length", "content-type"))
1278 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1280 origin_req_host=req.get_origin_req_host(),
# Build a minimal opener by hand so only these handlers are installed.
1284 opener = compat_urllib_request.OpenerDirector()
1285 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1286 HTTPMethodFallback, HEADRedirectHandler,
1287 compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1288 opener.add_handler(handler())
1290 response = opener.open(HeadRequest(url))
1291 new_url = response.geturl()
1296 self.report_following_redirect(new_url)
1297 self._downloader.download([new_url])
# Last-resort extraction: scan the raw page for a JW-Player/SWFObject style
# file= or source= URL, take the title from <title> and uploader from the host.
1300 def _real_extract(self, url):
1301 if self._test_redirect(url): return
1303 video_id = url.split('/')[-1]
1304 request = compat_urllib_request.Request(url)
1306 self.report_download_webpage(video_id)
1307 webpage = compat_urllib_request.urlopen(request).read()
1308 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1309 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1311 except ValueError as err:
1312 # since this is the last-resort InfoExtractor, if
1313 # this error is thrown, it'll be thrown here
1314 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1317 self.report_extraction(video_id)
1318 # Start with something easy: JW Player in SWFObject
1319 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1321 # Broaden the search a little bit
1322 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1324 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1327 # It's possible that one of the regexes
1328 # matched, but returned an empty group:
1329 if mobj.group(1) is None:
1330 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1333 video_url = compat_urllib_parse.unquote(mobj.group(1))
1334 video_id = os.path.basename(video_url)
1336 # here's a fun little line of code for you:
1337 video_extension = os.path.splitext(video_id)[1][1:]
1338 video_id = os.path.splitext(video_id)[0]
1340 # it's tempting to parse this further, but you would
1341 # have to take into account all the variations like
1342 # Video Title - Site Name
1343 # Site Name | Video Title
1344 # Video Title - Tagline | Site Name
1345 # and so on and so forth; it's just not practical
1346 mobj = re.search(r'<title>(.*)</title>', webpage)
1348 self._downloader.trouble(u'ERROR: unable to extract title')
1350 video_title = mobj.group(1)
1352 # video uploader is domain name
1353 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
# NOTE(review): copy-paste bug -- this branch fails to extract the UPLOADER
# (domain name) but reports 'unable to extract title'.
1355 self._downloader.trouble(u'ERROR: unable to extract title')
1357 video_uploader = mobj.group(1)
1362 'uploader': video_uploader,
1363 'upload_date': None,
1364 'title': video_title,
1365 'ext': video_extension,
1369 class YoutubeSearchIE(InfoExtractor):
1370 """Information Extractor for YouTube search queries."""
# NOTE(review): lossy extract -- embedded original line numbers skip values;
# `if mobj is None:`, `try:`, `return` and branch lines are missing here.
1371 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1372 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1373 _max_youtube_results = 1000
1374 IE_NAME = u'youtube:search'
1376 def __init__(self, downloader=None):
1377 InfoExtractor.__init__(self, downloader)
1379 def report_download_page(self, query, pagenum):
1380 """Report attempt to download search page with given number."""
# NOTE(review): `query.decode(...)` assumes a bytes query; on Python 3 a str
# has no .decode -- confirm which type callers pass in this codebase.
1381 query = query.decode(preferredencoding())
1382 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
# Parses "ytsearch[N|all]:terms" and delegates to _download_n_results.
1384 def _real_extract(self, query):
1385 mobj = re.match(self._VALID_URL, query)
1387 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# NOTE(review): split(':') with no maxsplit raises ValueError when the search
# terms themselves contain ':'; split(':', 1) would be safe.
1390 prefix, query = query.split(':')
1392 query = query.encode('utf-8')
1394 self._download_n_results(query, 1)
1396 elif prefix == 'all':
1397 self._download_n_results(query, self._max_youtube_results)
1403 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1405 elif n > self._max_youtube_results:
1406 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1407 n = self._max_youtube_results
1408 self._download_n_results(query, n)
1410 except ValueError: # parsing prefix as integer fails
1411 self._download_n_results(query, 1)
# Pages through the GData API (50 results per page) until n ids are collected.
1414 def _download_n_results(self, query, n):
1415 """Downloads a specified number of results for a query"""
1421 while (50 * pagenum) < limit:
1422 self.report_download_page(query, pagenum+1)
1423 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1424 request = compat_urllib_request.Request(result_url)
1426 data = compat_urllib_request.urlopen(request).read()
1427 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1428 self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
1430 api_response = json.loads(data)['data']
1432 new_ids = list(video['id'] for video in api_response['items'])
1433 video_ids += new_ids
# Cap the loop limit by what the API says actually exists.
1435 limit = min(n, api_response['totalItems'])
1438 if len(video_ids) > n:
1439 video_ids = video_ids[:n]
1440 for id in video_ids:
1441 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1445 class GoogleSearchIE(InfoExtractor):
1446 """Information Extractor for Google Video search queries."""
# NOTE(review): lossy extract -- embedded original line numbers skip values;
# guards, `try:` openers, `return`s and loop initializers are missing here.
1447 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1448 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1449 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1450 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1451 _max_google_results = 1000
1452 IE_NAME = u'video.google:search'
1454 def __init__(self, downloader=None):
1455 InfoExtractor.__init__(self, downloader)
1457 def report_download_page(self, query, pagenum):
1458 """Report attempt to download playlist page with given number."""
1459 query = query.decode(preferredencoding())
1460 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
# Parses "gvsearch[N|all]:terms" -- same prefix convention as ytsearch.
1462 def _real_extract(self, query):
1463 mobj = re.match(self._VALID_URL, query)
1465 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# NOTE(review): split(':') without maxsplit raises ValueError if the search
# terms contain ':'; split(':', 1) would be safe.
1468 prefix, query = query.split(':')
1470 query = query.encode('utf-8')
1472 self._download_n_results(query, 1)
1474 elif prefix == 'all':
1475 self._download_n_results(query, self._max_google_results)
1481 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1483 elif n > self._max_google_results:
1484 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1485 n = self._max_google_results
1486 self._download_n_results(query, n)
1488 except ValueError: # parsing prefix as integer fails
1489 self._download_n_results(query, 1)
# Scrapes result pages 10 at a time until n ids are found or no next page.
1492 def _download_n_results(self, query, n):
1493 """Downloads a specified number of results for a query"""
1499 self.report_download_page(query, pagenum)
1500 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
1501 request = compat_urllib_request.Request(result_url)
1503 page = compat_urllib_request.urlopen(request).read()
1504 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1505 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1508 # Extract video identifiers
1509 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1510 video_id = mobj.group(1)
1511 if video_id not in video_ids:
1512 video_ids.append(video_id)
1513 if len(video_ids) == n:
1514 # Specified n videos reached
1515 for id in video_ids:
1516 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
# No "next" link on the page means we have exhausted the results.
1519 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1520 for id in video_ids:
1521 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1524 pagenum = pagenum + 1
1527 class YahooSearchIE(InfoExtractor):
1528 """Information Extractor for Yahoo! Video search queries."""
# NOTE(review): lossy extract -- embedded original line numbers skip values;
# guards, `try:` openers, `return`s and loop initializers are missing here.
1531 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1532 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1533 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1534 _MORE_PAGES_INDICATOR = r'\s*Next'
1535 _max_yahoo_results = 1000
1536 IE_NAME = u'video.yahoo:search'
1538 def __init__(self, downloader=None):
1539 InfoExtractor.__init__(self, downloader)
1541 def report_download_page(self, query, pagenum):
1542 """Report attempt to download playlist page with given number."""
1543 query = query.decode(preferredencoding())
1544 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
# Parses "yvsearch[N|all]:terms" -- same prefix convention as ytsearch.
1546 def _real_extract(self, query):
1547 mobj = re.match(self._VALID_URL, query)
1549 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# NOTE(review): split(':') without maxsplit raises ValueError if the search
# terms contain ':'; split(':', 1) would be safe.
1552 prefix, query = query.split(':')
1554 query = query.encode('utf-8')
1556 self._download_n_results(query, 1)
1558 elif prefix == 'all':
1559 self._download_n_results(query, self._max_yahoo_results)
1565 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1567 elif n > self._max_yahoo_results:
1568 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1569 n = self._max_yahoo_results
1570 self._download_n_results(query, n)
1572 except ValueError: # parsing prefix as integer fails
1573 self._download_n_results(query, 1)
# Scrapes search result pages, de-duplicating via `already_seen`.
1576 def _download_n_results(self, query, n):
1577 """Downloads a specified number of results for a query"""
1580 already_seen = set()
1584 self.report_download_page(query, pagenum)
1585 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
1586 request = compat_urllib_request.Request(result_url)
1588 page = compat_urllib_request.urlopen(request).read()
1589 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1590 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1593 # Extract video identifiers
1594 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1595 video_id = mobj.group(1)
1596 if video_id not in already_seen:
1597 video_ids.append(video_id)
1598 already_seen.add(video_id)
1599 if len(video_ids) == n:
1600 # Specified n videos reached
1601 for id in video_ids:
1602 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
# No "Next" link on the page means the results are exhausted.
1605 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1606 for id in video_ids:
1607 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1610 pagenum = pagenum + 1
1613 class YoutubePlaylistIE(InfoExtractor):
1614 """Information Extractor for YouTube playlists."""
# NOTE(review): lossy extract -- embedded original line numbers skip values;
# `if mobj is None:` guards, `try:`/`else:` lines, loop initializers and
# `break`/`return` statements are missing from this view.
1616 _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
1617 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1618 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s'
1619 _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
1620 IE_NAME = u'youtube:playlist'
1622 def __init__(self, downloader=None):
1623 InfoExtractor.__init__(self, downloader)
1625 def report_download_page(self, playlist_id, pagenum):
1626 """Report attempt to download playlist page with given number."""
1627 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
# Pages through the playlist HTML, collects video ids, applies the
# playliststart/playlistend window, then queues each watch URL.
1629 def _real_extract(self, url):
1630 # Extract playlist id
1631 mobj = re.match(self._VALID_URL, url)
1633 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# Group 3 captures a single-video component of the URL: dispatch it directly
# instead of walking the whole playlist.
1637 if mobj.group(3) is not None:
1638 self._downloader.download([mobj.group(3)])
1641 # Download playlist pages
1642 # prefix is 'p' as default for playlists but there are other types that need extra care
1643 playlist_prefix = mobj.group(1)
1644 if playlist_prefix == 'a':
1645 playlist_access = 'artist'
1647 playlist_prefix = 'p'
1648 playlist_access = 'view_play_list'
1649 playlist_id = mobj.group(2)
1654 self.report_download_page(playlist_id, pagenum)
1655 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1656 request = compat_urllib_request.Request(url)
1658 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1659 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1660 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1663 # Extract video identifiers
1665 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1666 if mobj.group(1) not in ids_in_page:
1667 ids_in_page.append(mobj.group(1))
1668 video_ids.extend(ids_in_page)
1670 if self._MORE_PAGES_INDICATOR not in page:
1672 pagenum = pagenum + 1
1674 total = len(video_ids)
# --playlist-start is 1-based on the CLI; convert to a 0-based slice index.
1676 playliststart = self._downloader.params.get('playliststart', 1) - 1
1677 playlistend = self._downloader.params.get('playlistend', -1)
1678 if playlistend == -1:
1679 video_ids = video_ids[playliststart:]
1681 video_ids = video_ids[playliststart:playlistend]
1683 if len(video_ids) == total:
1684 self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
1686 self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))
1688 for id in video_ids:
1689 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1693 class YoutubeChannelIE(InfoExtractor):
1694 """Information Extractor for YouTube channels."""
# NOTE(review): lossy extract -- embedded original line numbers skip values;
# the guard after re.match, the `try:` opener, loop setup and `break` are
# missing from this view.
1696 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1697 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1698 _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
1699 IE_NAME = u'youtube:channel'
1701 def report_download_page(self, channel_id, pagenum):
1702 """Report attempt to download channel page with given number."""
1703 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
# Pages through the channel video list and queues every found watch URL.
1705 def _real_extract(self, url):
1706 # Extract channel id
1707 mobj = re.match(self._VALID_URL, url)
1709 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1712 # Download channel pages
1713 channel_id = mobj.group(1)
1718 self.report_download_page(channel_id, pagenum)
1719 url = self._TEMPLATE_URL % (channel_id, pagenum)
1720 request = compat_urllib_request.Request(url)
1722 page = compat_urllib_request.urlopen(request).read().decode('utf8')
1723 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1724 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1727 # Extract video identifiers
1729 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1730 if mobj.group(1) not in ids_in_page:
1731 ids_in_page.append(mobj.group(1))
1732 video_ids.extend(ids_in_page)
# Stop when the "Next »" marker disappears from the page.
1734 if self._MORE_PAGES_INDICATOR not in page:
1736 pagenum = pagenum + 1
1738 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1740 for id in video_ids:
1741 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1745 class YoutubeUserIE(InfoExtractor):
1746 """Information Extractor for YouTube users."""
# NOTE(review): lossy extract -- embedded original line numbers skip values;
# guards, `try:` openers, loop setup and `break` lines are missing here.
1748 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1749 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1750 _GDATA_PAGE_SIZE = 50
1751 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1752 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1753 IE_NAME = u'youtube:user'
1755 def __init__(self, downloader=None):
1756 InfoExtractor.__init__(self, downloader)
1758 def report_download_page(self, username, start_index):
1759 """Report attempt to download user page."""
1760 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1761 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
# Pages through the GData uploads feed (50 ids per request), applies the
# playliststart/playlistend window, then queues each watch URL.
1763 def _real_extract(self, url):
1765 mobj = re.match(self._VALID_URL, url)
1767 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1770 username = mobj.group(1)
1772 # Download video ids using YouTube Data API. Result size per
1773 # query is limited (currently to 50 videos) so we need to query
1774 # page by page until there are no video ids - it means we got
# GData start-index is 1-based, hence the +1.
1781 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1782 self.report_download_page(username, start_index)
1784 request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1787 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1788 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1789 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1792 # Extract video identifiers
1795 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1796 if mobj.group(1) not in ids_in_page:
1797 ids_in_page.append(mobj.group(1))
1799 video_ids.extend(ids_in_page)
1801 # A little optimization - if current page is not
1802 # "full", ie. does not contain PAGE_SIZE video ids then
1803 # we can assume that this page is the last one - there
1804 # are no more ids on further pages - no need to query
1807 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1812 all_ids_count = len(video_ids)
# --playlist-start is 1-based on the CLI; convert to a 0-based slice index.
1813 playliststart = self._downloader.params.get('playliststart', 1) - 1
1814 playlistend = self._downloader.params.get('playlistend', -1)
1816 if playlistend == -1:
1817 video_ids = video_ids[playliststart:]
1819 video_ids = video_ids[playliststart:playlistend]
1821 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1822 (username, all_ids_count, len(video_ids)))
1824 for video_id in video_ids:
1825 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1828 class BlipTVUserIE(InfoExtractor):
1829 """Information Extractor for blip.tv users."""
# NOTE(review): lossy extract -- embedded original line numbers skip values;
# guards, `try:` openers, loop setup and `break` lines are missing here.
# Also `self._PAGE_SIZE` (used below) is not visibly defined in this view --
# confirm the class attribute exists in the full source.
1831 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1833 IE_NAME = u'blip.tv:user'
1835 def __init__(self, downloader=None):
1836 InfoExtractor.__init__(self, downloader)
1838 def report_download_page(self, username, pagenum):
1839 """Report attempt to download user page."""
1840 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1841 (self.IE_NAME, username, pagenum))
# Resolves the numeric users_id from the profile page, then pages through the
# mobile AJAX episode list, windows the ids, and queues each blip.tv URL.
1843 def _real_extract(self, url):
1845 mobj = re.match(self._VALID_URL, url)
1847 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1850 username = mobj.group(1)
1852 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1854 request = compat_urllib_request.Request(url)
1857 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1858 mobj = re.search(r'data-users-id="([^"]+)"', page)
1859 page_base = page_base % mobj.group(1)
1860 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1861 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1865 # Download video ids using BlipTV Ajax calls. Result size per
1866 # query is limited (currently to 12 videos) so we need to query
1867 # page by page until there are no video ids - it means we got
1874 self.report_download_page(username, pagenum)
1876 request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )
1879 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
# NOTE(review): uses str(err) here but compat_str(err) everywhere else in
# this file -- inconsistent error formatting.
1880 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1881 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1884 # Extract video identifiers
# NOTE(review): membership test uses the RAW match but the list stores the
# unescapeHTML()'d value -- ids whose escaped/unescaped forms differ can be
# appended twice.
1887 for mobj in re.finditer(r'href="/([^"]+)"', page):
1888 if mobj.group(1) not in ids_in_page:
1889 ids_in_page.append(unescapeHTML(mobj.group(1)))
1891 video_ids.extend(ids_in_page)
1893 # A little optimization - if current page is not
1894 # "full", ie. does not contain PAGE_SIZE video ids then
1895 # we can assume that this page is the last one - there
1896 # are no more ids on further pages - no need to query
1899 if len(ids_in_page) < self._PAGE_SIZE:
1904 all_ids_count = len(video_ids)
# --playlist-start is 1-based on the CLI; convert to a 0-based slice index.
1905 playliststart = self._downloader.params.get('playliststart', 1) - 1
1906 playlistend = self._downloader.params.get('playlistend', -1)
1908 if playlistend == -1:
1909 video_ids = video_ids[playliststart:]
1911 video_ids = video_ids[playliststart:playlistend]
1913 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1914 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1916 for video_id in video_ids:
1917 self._downloader.download([u'http://blip.tv/'+video_id])
1920 class DepositFilesIE(InfoExtractor):
1921 """Information extractor for depositfiles.com"""
# NOTE(review): lossy extract -- embedded original line numbers skip values;
# the `try:` opener and `return` statements are missing from this view.
1923 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1925 def report_download_webpage(self, file_id):
1926 """Report webpage download."""
1927 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1929 def report_extraction(self, file_id):
1930 """Report information extraction."""
1931 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
# POSTs the 'Free download' form, scrapes the real fileshare URL and the
# <b title="..."> file title from the resulting page.
1933 def _real_extract(self, url):
1934 file_id = url.split('/')[-1]
1935 # Rebuild url in english locale
1936 url = 'http://depositfiles.com/en/files/' + file_id
1938 # Retrieve file webpage with 'Free download' button pressed
1939 free_download_indication = { 'gateway_result' : '1' }
1940 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
1942 self.report_download_webpage(file_id)
1943 webpage = compat_urllib_request.urlopen(request).read()
1944 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1945 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
1948 # Search for the real file URL
1949 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1950 if (mobj is None) or (mobj.group(1) is None):
1951 # Try to figure out reason of the error.
1952 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1953 if (mobj is not None) and (mobj.group(1) is not None):
# NOTE(review): '\s+' is a non-raw string -- an invalid escape sequence on
# modern Python; should be the raw literal r'\s+'.
1954 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1955 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1957 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1960 file_url = mobj.group(1)
1961 file_extension = os.path.splitext(file_url)[1][1:]
1963 # Search for file title
1964 mobj = re.search(r'<b title="(.*?)">', webpage)
1966 self._downloader.trouble(u'ERROR: unable to extract title')
# NOTE(review): the .decode('utf-8') calls below assume byte strings
# (Python 2); under Python 3 `str` has no .decode and these raise
# AttributeError -- confirm against the compat layer before porting.
1968 file_title = mobj.group(1).decode('utf-8')
1971 'id': file_id.decode('utf-8'),
1972 'url': file_url.decode('utf-8'),
1974 'upload_date': None,
1975 'title': file_title,
1976 'ext': file_extension.decode('utf-8'),
1980 class FacebookIE(InfoExtractor):
1981 """Information Extractor for Facebook"""
# NOTE(review): sampled listing — lines are missing between the numbered
# gaps (try:/return/else branches and the login_form construction).
1983 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1984 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1985 _NETRC_MACHINE = 'facebook'
1986 IE_NAME = u'facebook'
1988 def report_login(self):
1989 """Report attempt to log in."""
1990 self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)
1992 def _real_initialize(self):
# Optional login step: credentials come from --username/--password or,
# failing that, from the 'facebook' machine entry in ~/.netrc.
1993 if self._downloader is None:
1998 downloader_params = self._downloader.params
2000 # Attempt to use provided username and password or .netrc data
2001 if downloader_params.get('username', None) is not None:
2002 useremail = downloader_params['username']
2003 password = downloader_params['password']
2004 elif downloader_params.get('usenetrc', False):
2006 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2007 if info is not None:
2011 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2012 except (IOError, netrc.NetrcParseError) as err:
2013 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
2016 if useremail is None:
2025 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2028 login_results = compat_urllib_request.urlopen(request).read()
# A login <form> still present in the response means authentication failed;
# failure is only a warning, extraction proceeds unauthenticated.
2029 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2030 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2032 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2033 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
2036 def _real_extract(self, url):
2037 mobj = re.match(self._VALID_URL, url)
2039 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2041 video_id = mobj.group('ID')
2043 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
2044 webpage = self._download_webpage(url, video_id)
# The flashvars JSON array sits between these two literal JS fragments;
# re.escape lets them be used verbatim as regex anchors.
2046 BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
2047 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
2048 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
2050 raise ExtractorError(u'Cannot parse data')
2051 data = dict(json.loads(m.group(1)))
2052 video_url = compat_urllib_parse.unquote(data['hd_src'])
2053 video_duration = int(data['video_duration'])
2055 m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
2057 raise ExtractorError(u'Cannot find title in webpage')
2058 video_title = unescapeHTML(m.group(1))
2062 'title': video_title,
2065 'duration': video_duration,
2066 'thumbnail': data['thumbnail_src'],
2071 class BlipTVIE(InfoExtractor):
2072 """Information extractor for blip.tv"""
# NOTE(review): sampled listing — the try:/return scaffolding around the
# except clauses below is missing from this view.
2074 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2075 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2076 IE_NAME = u'blip.tv'
2078 def report_extraction(self, file_id):
2079 """Report information extraction."""
2080 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2082 def report_direct_download(self, title):
2083 """Report information extraction."""
2084 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2086 def _real_extract(self, url):
2087 mobj = re.match(self._VALID_URL, url)
2089 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Ask blip.tv's JSON skin for metadata; the iTunes User-Agent is required
# for this endpoint to respond usefully.
2096 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2097 request = compat_urllib_request.Request(json_url)
2098 request.add_header('User-Agent', 'iTunes/10.6.1')
2099 self.report_extraction(mobj.group(1))
2102 urlh = compat_urllib_request.urlopen(request)
2103 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
# A video/* Content-Type means the URL is the media itself; derive title
# and extension from the URL basename instead of parsing JSON.
2104 basename = url.split('/')[-1]
2105 title,ext = os.path.splitext(basename)
2106 title = title.decode('UTF-8')
2107 ext = ext.replace('.', '')
2108 self.report_direct_download(title)
2113 'upload_date': None,
2118 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2119 raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2120 if info is None: # Regular URL
2122 json_code_bytes = urlh.read()
2123 json_code = json_code_bytes.decode('utf-8')
2124 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2125 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
2129 json_data = json.loads(json_code)
2130 if 'Post' in json_data:
2131 data = json_data['Post']
# blip.tv datestamps look like '10-31-12 08:30PM'; normalized to YYYYMMDD.
2135 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2136 video_url = data['media']['url']
2137 umobj = re.match(self._URL_EXT, video_url)
2139 raise ValueError('Can not determine filename extension')
2140 ext = umobj.group(1)
2143 'id': data['item_id'],
2145 'uploader': data['display_name'],
2146 'upload_date': upload_date,
2147 'title': data['title'],
2149 'format': data['media']['mimeType'],
2150 'thumbnail': data['thumbnailUrl'],
2151 'description': data['description'],
2152 'player_url': data['embedUrl'],
2153 'user_agent': 'iTunes/10.6.1',
2155 except (ValueError,KeyError) as err:
2156 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the flv URL and title from a myvideo.de watch page.

        Returns a single-element list with the info dictionary, or None
        (after reporting trouble) when the URL or page cannot be parsed.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUGFIX: was 'self._download.trouble(...)' — the attribute set by
            # set_downloader() is self._downloader; the typo raised
            # AttributeError on every invalid URL instead of reporting it.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The image_src <link> carries the media base URL; the flv itself
        # lives at <base>/<video_id>.flv
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
                         webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':          video_id,
            'url':         video_url,
            'uploader':    None,
            'upload_date': None,
            'title':       video_title,
            'ext':         u'flv',
        }]
2211 class ComedyCentralIE(InfoExtractor):
2212 """Information extractor for The Daily Show and Colbert Report """
# NOTE(review): sampled listing — numbered gaps mean lines (dict bodies,
# try:/return statements, loop bodies) are missing from this view.
2214 # urls can be abbreviations like :thedailyshow or :colbert
2215 # urls for episodes like:
2216 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2217 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2218 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2219 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2220 |(https?://)?(www\.)?
2221 (?P<showname>thedailyshow|colbertnation)\.com/
2222 (full-episodes/(?P<episode>.*)|
2224 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2225 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
2228 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2230 _video_extensions = {
2238 _video_dimensions = {
2247 def suitable(self, url):
2248 """Receives a URL and returns True if suitable for this IE."""
# Overridden because _VALID_URL is a verbose-mode pattern; the base-class
# suitable() (see HEAD) matches without re.VERBOSE.
2249 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
2251 def report_extraction(self, episode_id):
2252 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2254 def report_config_download(self, episode_id, media_id):
2255 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))
2257 def report_index_download(self, episode_id):
2258 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2260 def _print_formats(self, formats):
2261 print('Available formats:')
2263 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2266 def _real_extract(self, url):
2267 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2269 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# ':tds' / ':colbert' style shortcuts expand to the show's full-episodes
# page and are re-matched so the named groups are populated.
2272 if mobj.group('shortname'):
2273 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2274 url = u'http://www.thedailyshow.com/full-episodes/'
2276 url = u'http://www.colbertnation.com/full-episodes/'
2277 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2278 assert mobj is not None
2280 if mobj.group('clip'):
2281 if mobj.group('showname') == 'thedailyshow':
2282 epTitle = mobj.group('tdstitle')
2284 epTitle = mobj.group('cntitle')
2287 dlNewest = not mobj.group('episode')
2289 epTitle = mobj.group('showname')
2291 epTitle = mobj.group('episode')
2293 req = compat_urllib_request.Request(url)
2294 self.report_extraction(epTitle)
2296 htmlHandle = compat_urllib_request.urlopen(req)
2297 html = htmlHandle.read()
2298 webpage = html.decode('utf-8')
2299 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2300 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
# The site may redirect (e.g. to the newest episode); re-validate the
# final URL so the episode group reflects what was actually fetched.
2303 url = htmlHandle.geturl()
2304 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2306 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2308 if mobj.group('episode') == '':
2309 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2311 epTitle = mobj.group('episode')
2313 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2315 if len(mMovieParams) == 0:
2316 # The Colbert Report embeds the information in a without
2317 # a URL prefix; so extract the alternate reference
2318 # and then add the URL prefix manually.
2320 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2321 if len(altMovieParams) == 0:
2322 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2325 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2327 uri = mMovieParams[0][1]
2328 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2329 self.report_index_download(epTitle)
2331 indexXml = compat_urllib_request.urlopen(indexUrl).read()
2332 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2333 self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
# Each <item> in the MRSS index is one part of the episode; a per-part
# config XML is then fetched to enumerate the available renditions.
2338 idoc = xml.etree.ElementTree.fromstring(indexXml)
2339 itemEls = idoc.findall('.//item')
2340 for partNum,itemEl in enumerate(itemEls):
2341 mediaId = itemEl.findall('./guid')[0].text
2342 shortMediaId = mediaId.split(':')[-1]
2343 showId = mediaId.split(':')[-2].replace('.com', '')
2344 officialTitle = itemEl.findall('./title')[0].text
2345 officialDate = itemEl.findall('./pubDate')[0].text
2347 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2348 compat_urllib_parse.urlencode({'uri': mediaId}))
2349 configReq = compat_urllib_request.Request(configUrl)
2350 self.report_config_download(epTitle, shortMediaId)
2352 configXml = compat_urllib_request.urlopen(configReq).read()
2353 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2354 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
2357 cdoc = xml.etree.ElementTree.fromstring(configXml)
2359 for rendition in cdoc.findall('.//rendition'):
2360 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2364 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2367 if self._downloader.params.get('listformats', None):
2368 self._print_formats([i[0] for i in turls])
2371 # For now, just pick the highest bitrate
2372 format,rtmp_video_url = turls[-1]
2374 # Get the format arg from the arg stream
2375 req_format = self._downloader.params.get('format', None)
2377 # Select format if we can find one
2380 format, rtmp_video_url = f, v
# The rtmp URL is rewritten into a plain HTTP mp4 URL on the llnwd CDN;
# only the 'gsp.comedystor/...' path suffix is reused.
2383 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2385 raise ExtractorError(u'Cannot transform RTMP url')
2386 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2387 video_url = base + m.group('finalid')
2389 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2394 'upload_date': officialDate,
2399 'description': officialTitle,
2401 results.append(info)
2406 class EscapistIE(InfoExtractor):
2407 """Information extractor for The Escapist """
# NOTE(review): sampled listing — try:/return lines around the except
# clauses below are missing from this view.
2409 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2410 IE_NAME = u'escapist'
2412 def report_extraction(self, showName):
2413 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2415 def report_config_download(self, showName):
2416 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2418 def _real_extract(self, url):
2419 mobj = re.match(self._VALID_URL, url)
2421 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2423 showName = mobj.group('showname')
2424 videoId = mobj.group('episode')
2426 self.report_extraction(showName)
2428 webPage = compat_urllib_request.urlopen(url)
2429 webPageBytes = webPage.read()
# Decode using the charset announced in the Content-Type header,
# defaulting to utf-8 when none is present.
2430 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2431 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2432 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2433 self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
# NOTE(review): these .group(1) calls assume every <meta> tag is present;
# a missing tag would raise AttributeError on None rather than report a
# clean extraction error.
2436 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2437 description = unescapeHTML(descMatch.group(1))
2438 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2439 imgUrl = unescapeHTML(imgMatch.group(1))
2440 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2441 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2442 configUrlMatch = re.search('config=(.*)$', playerUrl)
2443 configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2445 self.report_config_download(showName)
2447 configJSON = compat_urllib_request.urlopen(configUrl)
2448 m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
2449 configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
2450 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2451 self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
2454 # Technically, it's JavaScript, not JSON
2455 configJSON = configJSON.replace("'", '"')
2458 config = json.loads(configJSON)
2459 except (ValueError,) as err:
2460 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
2463 playlist = config['playlist']
2464 videoUrl = playlist[1]['url']
2469 'uploader': showName,
2470 'upload_date': None,
2473 'thumbnail': imgUrl,
2474 'description': description,
2475 'player_url': playerUrl,
2480 class CollegeHumorIE(InfoExtractor):
2481 """Information extractor for collegehumor.com"""
# NOTE(review): sampled listing — try:/return lines and parts of the
# info dict literal are missing from this view.
2484 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2485 IE_NAME = u'collegehumor'
2487 def report_manifest(self, video_id):
2488 """Report information extraction."""
2489 self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))
2491 def report_extraction(self, video_id):
2492 """Report information extraction."""
2493 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2495 def _real_extract(self, url):
2496 mobj = re.match(self._VALID_URL, url)
2498 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2500 video_id = mobj.group('videoid')
2505 'upload_date': None,
# Step 1: the moogaloop metadata XML yields title/description/thumbnail
# and the URL of an f4m manifest.
2508 self.report_extraction(video_id)
2509 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2511 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2512 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2513 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2516 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2518 videoNode = mdoc.findall('./video')[0]
2519 info['description'] = videoNode.findall('./description')[0].text
2520 info['title'] = videoNode.findall('./caption')[0].text
2521 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2522 manifest_url = videoNode.findall('./file')[0].text
2524 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
# Step 2: the f4m (Adobe HDS) manifest provides the media node used to
# assemble the final segment URL below.
2527 manifest_url += '?hdcore=2.10.3'
2528 self.report_manifest(video_id)
2530 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2531 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2532 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2535 adoc = xml.etree.ElementTree.fromstring(manifestXml)
2537 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2538 node_id = media_node.attrib['url']
2539 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2540 except IndexError as err:
2541 self._downloader.trouble(u'\nERROR: Invalid manifest file')
2544 url_pr = compat_urllib_parse_urlparse(manifest_url)
2545 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
2552 class XVideosIE(InfoExtractor):
2553 """Information extractor for xvideos.com"""
# NOTE(review): sampled listing — the 'if mobj is None:'/'return' guard
# lines between the searches and trouble() calls are missing here.
2555 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2556 IE_NAME = u'xvideos'
2558 def report_extraction(self, video_id):
2559 """Report information extraction."""
2560 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2562 def _real_extract(self, url):
2563 mobj = re.match(self._VALID_URL, url)
2565 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2567 video_id = mobj.group(1)
2569 webpage = self._download_webpage(url, video_id)
2571 self.report_extraction(video_id)
# The flv URL is URL-encoded inside the page's flashvars.
2575 mobj = re.search(r'flv_url=(.+?)&', webpage)
2577 self._downloader.trouble(u'ERROR: unable to extract video url')
2579 video_url = compat_urllib_parse.unquote(mobj.group(1))
2583 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2585 self._downloader.trouble(u'ERROR: unable to extract video title')
2587 video_title = mobj.group(1)
2590 # Extract video thumbnail
# group(0) is deliberate: the whole matched URL (ending in the jpg name
# captured by group 1) is the thumbnail address.
2591 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2593 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2595 video_thumbnail = mobj.group(0)
2601 'upload_date': None,
2602 'title': video_title,
2604 'thumbnail': video_thumbnail,
2605 'description': None,
2611 class SoundcloudIE(InfoExtractor):
2612 """Information extractor for soundcloud.com
2613 To access the media, the uid of the song and a stream token
2614 must be extracted from the page source and the script must make
2615 a request to media.soundcloud.com/crossdomain.xml. Then
2616 the media can be grabbed by requesting from an url composed
2617 of the stream token and uid
# NOTE(review): sampled listing — try:/return lines around the except
# clauses below are missing from this view.
2620 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2621 IE_NAME = u'soundcloud'
2623 def __init__(self, downloader=None):
2624 InfoExtractor.__init__(self, downloader)
2626 def report_resolve(self, video_id):
2627 """Report information extraction."""
2628 self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))
2630 def report_extraction(self, video_id):
2631 """Report information extraction."""
2632 self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))
2634 def _real_extract(self, url):
2635 mobj = re.match(self._VALID_URL, url)
2637 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2640 # extract uploader (which is in the url)
2641 uploader = mobj.group(1)
2642 # extract simple title (uploader + slug of song title)
2643 slug_title = mobj.group(2)
2644 simple_title = uploader + u'-' + slug_title
2646 self.report_resolve('%s/%s' % (uploader, slug_title))
# Step 1: resolve the page URL to a track id via the public API.
# NOTE(review): client_id is a hard-coded API key baked into this source.
2648 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2649 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2650 request = compat_urllib_request.Request(resolv_url)
2652 info_json_bytes = compat_urllib_request.urlopen(request).read()
2653 info_json = info_json_bytes.decode('utf-8')
2654 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2655 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2658 info = json.loads(info_json)
2659 video_id = info['id']
2660 self.report_extraction('%s/%s' % (uploader, slug_title))
# Step 2: the streams endpoint yields the direct mp3 URL for the track.
2662 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2663 request = compat_urllib_request.Request(streams_url)
2665 stream_json_bytes = compat_urllib_request.urlopen(request).read()
2666 stream_json = stream_json_bytes.decode('utf-8')
2667 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2668 self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
2671 streams = json.loads(stream_json)
2672 mediaURL = streams['http_mp3_128_url']
2677 'uploader': info['user']['username'],
2678 'upload_date': info['created_at'],
2679 'title': info['title'],
2681 'description': info['description'],
2685 class InfoQIE(InfoExtractor):
2686 """Information extractor for infoq.com"""
# NOTE(review): sampled listing — guard/return lines are missing between
# the searches and the trouble() calls below.
2687 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2689 def report_extraction(self, video_id):
2690 """Report information extraction."""
2691 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2693 def _real_extract(self, url):
2694 mobj = re.match(self._VALID_URL, url)
2696 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2699 webpage = self._download_webpage(url, video_id=url)
2700 self.report_extraction(url)
# The real media id is base64-encoded in the page's jsclassref attribute;
# decoding and unquoting it yields the rtmpe path suffix.
2703 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2705 self._downloader.trouble(u'ERROR: unable to extract video url')
2707 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2708 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2711 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2713 self._downloader.trouble(u'ERROR: unable to extract video title')
2715 video_title = mobj.group(1)
2717 # Extract description
2718 video_description = u'No description available.'
2719 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2720 if mobj is not None:
2721 video_description = mobj.group(1)
# The id/extension are derived from the media filename embedded in the URL.
2723 video_filename = video_url.split('/')[-1]
2724 video_id, extension = video_filename.split('.')
2730 'upload_date': None,
2731 'title': video_title,
2732 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2734 'description': video_description,
2739 class MixcloudIE(InfoExtractor):
2740 """Information extractor for www.mixcloud.com"""
# Marked broken (_WORKING = False): the site moved to a new API; kept for
# reference. NOTE(review): sampled listing — try:/return/loop lines are
# missing between the numbered gaps below.
2742 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2743 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2744 IE_NAME = u'mixcloud'
2746 def __init__(self, downloader=None):
2747 InfoExtractor.__init__(self, downloader)
2749 def report_download_json(self, file_id):
2750 """Report JSON download."""
2751 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2753 def report_extraction(self, file_id):
2754 """Report information extraction."""
2755 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2757 def get_urls(self, jsonData, fmt, bitrate='best'):
2758 """Get urls from 'audio_formats' section in json"""
2761 bitrate_list = jsonData[fmt]
2762 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2763 bitrate = max(bitrate_list) # select highest
2765 url_list = jsonData[fmt][bitrate]
2766 except TypeError: # we have no bitrate info.
2767 url_list = jsonData[fmt]
2770 def check_urls(self, url_list):
2771 """Returns 1st active url from list"""
# Probes each candidate with a real request; network errors fall through
# to the next candidate.
2772 for url in url_list:
2774 compat_urllib_request.urlopen(url)
2776 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2781 def _print_formats(self, formats):
2782 print('Available formats:')
2783 for fmt in formats.keys():
2784 for b in formats[fmt]:
2786 ext = formats[fmt][b][0]
2787 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2788 except TypeError: # we have no bitrate info
2789 ext = formats[fmt][0]
2790 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2793 def _real_extract(self, url):
2794 mobj = re.match(self._VALID_URL, url)
2796 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# NOTE(review): .decode('utf-8') on regex groups assumes Python 2 byte
# strings; under Python 3 these calls would fail on str.
2798 # extract uploader & filename from url
2799 uploader = mobj.group(1).decode('utf-8')
2800 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2802 # construct API request
2803 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2804 # retrieve .json file with links to files
2805 request = compat_urllib_request.Request(file_url)
2807 self.report_download_json(file_url)
2808 jsonData = compat_urllib_request.urlopen(request).read()
2809 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2810 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
2814 json_data = json.loads(jsonData)
2815 player_url = json_data['player_swf_url']
2816 formats = dict(json_data['audio_formats'])
2818 req_format = self._downloader.params.get('format', None)
2821 if self._downloader.params.get('listformats', None):
2822 self._print_formats(formats)
# Format selection: 'best' (or unset) probes every format for a live URL;
# otherwise the explicitly requested format must exist.
2825 if req_format is None or req_format == 'best':
2826 for format_param in formats.keys():
2827 url_list = self.get_urls(formats, format_param)
2829 file_url = self.check_urls(url_list)
2830 if file_url is not None:
2833 if req_format not in formats:
2834 self._downloader.trouble(u'ERROR: format is not available')
2837 url_list = self.get_urls(formats, req_format)
2838 file_url = self.check_urls(url_list)
2839 format_param = req_format
2842 'id': file_id.decode('utf-8'),
2843 'url': file_url.decode('utf-8'),
2844 'uploader': uploader.decode('utf-8'),
2845 'upload_date': None,
2846 'title': json_data['name'],
2847 'ext': file_url.split('.')[-1].decode('utf-8'),
2848 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2849 'thumbnail': json_data['thumbnail_url'],
2850 'description': json_data['description'],
2851 'player_url': player_url.decode('utf-8'),
2854 class StanfordOpenClassroomIE(InfoExtractor):
2855 """Information extractor for Stanford's Open ClassRoom"""
# Handles three URL shapes via named groups: a specific video
# (course+video), a course page (course only), and the root page (neither).
# Course/root pages return reference entries that are re-dispatched
# through self.extract(). NOTE(review): sampled listing — try:/return
# lines and parts of the info dicts are missing from this view.
2857 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2858 IE_NAME = u'stanfordoc'
2860 def report_download_webpage(self, objid):
2861 """Report information extraction."""
2862 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2864 def report_extraction(self, video_id):
2865 """Report information extraction."""
2866 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2868 def _real_extract(self, url):
2869 mobj = re.match(self._VALID_URL, url)
2871 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2874 if mobj.group('course') and mobj.group('video'): # A specific video
2875 course = mobj.group('course')
2876 video = mobj.group('video')
2878 'id': course + '_' + video,
2880 'upload_date': None,
2883 self.report_extraction(info['id'])
2884 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2885 xmlUrl = baseUrl + video + '.xml'
2887 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2888 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2889 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2891 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2893 info['title'] = mdoc.findall('./title')[0].text
2894 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2896 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2898 info['ext'] = info['url'].rpartition('.')[2]
2900 elif mobj.group('course'): # A course page
2901 course = mobj.group('course')
2906 'upload_date': None,
2909 self.report_download_webpage(info['id'])
2911 coursepage = compat_urllib_request.urlopen(url).read()
2912 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2913 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
2916 m = re.search('<h1>([^<]+)</h1>', coursepage)
2918 info['title'] = unescapeHTML(m.group(1))
2920 info['title'] = info['id']
2922 m = re.search('<description>([^<]+)</description>', coursepage)
2924 info['description'] = unescapeHTML(m.group(1))
# Each VideoPage link on the course page becomes a 'reference' entry that
# is recursively resolved via self.extract() below.
2926 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2929 'type': 'reference',
2930 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2934 for entry in info['list']:
2935 assert entry['type'] == 'reference'
2936 results += self.extract(entry['url'])
2941 'id': 'Stanford OpenClassroom',
2944 'upload_date': None,
2947 self.report_download_webpage(info['id'])
2948 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2950 rootpage = compat_urllib_request.urlopen(rootURL).read()
2951 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2952 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
2955 info['title'] = info['id']
2957 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2960 'type': 'reference',
2961 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2966 for entry in info['list']:
2967 assert entry['type'] == 'reference'
2968 results += self.extract(entry['url'])
# Extractor for mtv.com video pages: scrapes the mtv_vt (song name),
# mtv_an (performer) and mtvn_uri <meta> tags plus the default playlist
# id, fetches the mediaGen rendition XML, and picks the last (highest
# quality) <rendition>.
# NOTE(review): this listing has lines elided ("if mobj is None:"
# guards, "try:" headers, part of the result dict) — confirm against
# the full file before editing.
2971 class MTVIE(InfoExtractor):
2972 """Information extractor for MTV.com"""
2974 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2977 def report_extraction(self, video_id):
2978 """Report information extraction."""
2979 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2981 def _real_extract(self, url):
2982 mobj = re.match(self._VALID_URL, url)
2984 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# The scheme is optional in _VALID_URL; normalize to http:// here.
2986 if not mobj.group('proto'):
2987 url = 'http://' + url
2988 video_id = mobj.group('videoid')
2990 webpage = self._download_webpage(url, video_id)
2992 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2994 self._downloader.trouble(u'ERROR: unable to extract song name')
# NOTE(review): .decode('iso-8859-1') on a str raises AttributeError on
# Python 3 (webpage from _download_webpage is already text) — verify
# which Python this file targets.
2996 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2997 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2999 self._downloader.trouble(u'ERROR: unable to extract performer')
3001 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3002 video_title = performer + ' - ' + song_name
3004 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3006 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3008 mtvn_uri = mobj.group(1)
3010 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3012 self._downloader.trouble(u'ERROR: unable to extract content id')
3014 content_id = mobj.group(1)
# mediaGen endpoint returns the rendition list for this uri/id pair.
3016 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3017 self.report_extraction(video_id)
3018 request = compat_urllib_request.Request(videogen_url)
3020 metadataXml = compat_urllib_request.urlopen(request).read()
3021 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3022 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
3025 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3026 renditions = mdoc.findall('.//rendition')
3028 # For now, always pick the highest quality.
3029 rendition = renditions[-1]
# Format label "<ext>-<width>x<height>_<bitrate>" built from rendition
# attributes; the <src> child holds the actual media URL.
3032 _,_,ext = rendition.attrib['type'].partition('/')
3033 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3034 video_url = rendition.find('./src').text
3036 self._downloader.trouble('Invalid rendition field.')
3042 'uploader': performer,
3043 'upload_date': None,
3044 'title': video_title,
# Extractor for v.youku.com.  Youku obfuscates its file ids: a seeded
# linear-congruential shuffle of a fixed alphabet (_get_file_ID_mix_string)
# maps the '*'-separated digit list in the playlist JSON back to the real
# file id (_get_file_id).  Videos are served in numbered segments whose
# index is spliced into columns 8-9 of the file id.
# NOTE(review): this listing has lines elided (the _gen_sid def line,
# guards, "try:" headers, parts of the format-selection ladder and the
# per-segment info dict) — confirm against the full file before editing.
3052 class YoukuIE(InfoExtractor):
3053 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3055 def report_download_webpage(self, file_id):
3056 """Report webpage download."""
3057 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))
3059 def report_extraction(self, file_id):
3060 """Report information extraction."""
3061 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
# Session id: millisecond timestamp + two random components (belongs to
# _gen_sid, whose def line is elided from this listing).
3064 nowTime = int(time.time() * 1000)
3065 random1 = random.randint(1000,1998)
3066 random2 = random.randint(1000,9999)
3068 return "%d%d%d" %(nowTime,random1,random2)
3070 def _get_file_ID_mix_string(self, seed):
3072 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
# Seeded LCG (mod 65536) repeatedly picks-and-removes a character from
# the alphabet, producing a deterministic permutation for this seed.
3074 for i in range(len(source)):
3075 seed = (seed * 211 + 30031 ) % 65536
3076 index = math.floor(seed / 65536 * len(source) )
3077 mixed.append(source[int(index)])
3078 source.remove(source[int(index)])
3079 #return ''.join(mixed)
# Map each numeric index from the '*'-separated fileId onto the shuffled
# alphabet to recover the real file id.
3082 def _get_file_id(self, fileId, seed):
3083 mixed = self._get_file_ID_mix_string(seed)
3084 ids = fileId.split('*')
3088 realId.append(mixed[int(ch)])
3089 return ''.join(realId)
3091 def _real_extract(self, url):
3092 mobj = re.match(self._VALID_URL, url)
3094 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3096 video_id = mobj.group('ID')
# getPlayList returns JSON with title, seed, streamfileids and segs.
3098 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3100 request = compat_urllib_request.Request(info_url, None, std_headers)
3102 self.report_download_webpage(video_id)
3103 jsondata = compat_urllib_request.urlopen(request).read()
3104 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3105 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3108 self.report_extraction(video_id)
3110 jsonstr = jsondata.decode('utf-8')
3111 config = json.loads(jsonstr)
3113 video_title = config['data'][0]['title']
3114 seed = config['data'][0]['seed']
# Format selection: honour --format, defaulting to best available
# (the elided lines presumably map best/worst onto hd2/flv etc.).
3116 format = self._downloader.params.get('format', None)
3117 supported_format = list(config['data'][0]['streamfileids'].keys())
3119 if format is None or format == 'best':
3120 if 'hd2' in supported_format:
3125 elif format == 'worst':
3133 fileid = config['data'][0]['streamfileids'][format]
3134 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3135 except (UnicodeDecodeError, ValueError, KeyError):
3136 self._downloader.trouble(u'ERROR: unable to extract info section')
3140 sid = self._gen_sid()
3141 fileid = self._get_file_id(fileid, seed)
3143 #column 8,9 of fileid represent the segment number
3144 #fileid[7:9] should be changed
# One download URL per segment; index is hex-encoded into the fileid
# and the per-segment key 'k' authenticates the request.
3145 for index, key in enumerate(keys):
3147 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3148 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3151 'id': '%s_part%02d' % (video_id, index),
3152 'url': download_url,
3154 'upload_date': None,
3155 'title': video_title,
3158 files_info.append(info)
# Extractor for video.xnxx.com: the flv URL, title and thumbnail are all
# scraped from the watch page with the three class-level regexes below.
# NOTE(review): this listing has lines elided ("if mobj is None:" /
# "if result is None:" guards, "try:" header, parts of the info dict).
3163 class XNXXIE(InfoExtractor):
3164 """Information extractor for xnxx.com"""
3166 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
# flv_url is URL-encoded in the page; unquoted after matching.
3168 VIDEO_URL_RE = r'flv_url=(.*?)&'
3169 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3170 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3172 def report_webpage(self, video_id):
3173 """Report information extraction"""
3174 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3176 def report_extraction(self, video_id):
3177 """Report information extraction"""
3178 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3180 def _real_extract(self, url):
3181 mobj = re.match(self._VALID_URL, url)
3183 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3185 video_id = mobj.group(1)
3187 self.report_webpage(video_id)
3189 # Get webpage content
3191 webpage_bytes = compat_urllib_request.urlopen(url).read()
3192 webpage = webpage_bytes.decode('utf-8')
3193 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3194 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3197 result = re.search(self.VIDEO_URL_RE, webpage)
3199 self._downloader.trouble(u'ERROR: unable to extract video url')
3201 video_url = compat_urllib_parse.unquote(result.group(1))
3203 result = re.search(self.VIDEO_TITLE_RE, webpage)
3205 self._downloader.trouble(u'ERROR: unable to extract video title')
3207 video_title = result.group(1)
3209 result = re.search(self.VIDEO_THUMB_RE, webpage)
3211 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3213 video_thumbnail = result.group(1)
3219 'upload_date': None,
3220 'title': video_title,
3222 'thumbnail': video_thumbnail,
3223 'description': None,
# Extractor for Google+ posts: scrapes the post page for date, uploader
# and title, follows the embedded photos URL to the video page, then
# collects all googlevideo redirector links and keeps the highest
# resolution one.
# NOTE(review): this listing has lines elided (guards, "try:" headers,
# parts of the result dict) — confirm against the full file.
3227 class GooglePlusIE(InfoExtractor):
3228 """Information extractor for plus.google.com."""
3230 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3231 IE_NAME = u'plus.google'
3233 def __init__(self, downloader=None):
3234 InfoExtractor.__init__(self, downloader)
3236 def report_extract_entry(self, url):
3237 """Report that the post entry page is being downloaded."""
3238 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3240 def report_date(self, upload_date):
3241 """Report the extracted upload date."""
3242 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3244 def report_uploader(self, uploader):
3245 """Report the extracted uploader name."""
3246 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3248 def report_title(self, video_title):
3249 """Report the extracted title."""
3250 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3252 def report_extract_vid_page(self, video_page):
3253 """Report information extraction."""
3254 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3256 def _real_extract(self, url):
3257 # Extract id from URL
3258 mobj = re.match(self._VALID_URL, url)
3260 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3263 post_url = mobj.group(0)
3264 video_id = mobj.group(1)
3266 video_extension = 'flv'
3268 # Step 1, Retrieve post webpage to extract further information
3269 self.report_extract_entry(post_url)
3270 request = compat_urllib_request.Request(post_url)
3272 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3273 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3274 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3277 # Extract update date
3279 pattern = 'title="Timestamp">(.*?)</a>'
3280 mobj = re.search(pattern, webpage)
3282 upload_date = mobj.group(1)
3283 # Convert timestring to a format suitable for filename
3284 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3285 upload_date = upload_date.strftime('%Y%m%d')
3286 self.report_date(upload_date)
# Uploader name from the rel="author" anchor.
3290 pattern = r'rel\="author".*?>(.*?)</a>'
3291 mobj = re.search(pattern, webpage)
3293 uploader = mobj.group(1)
3294 self.report_uploader(uploader)
3297 # Get the first line for title
3299 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3300 mobj = re.search(pattern, webpage)
3302 video_title = mobj.group(1)
3303 self.report_title(video_title)
3305 # Step 2, Stimulate clicking the image box to launch video
3306 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3307 mobj = re.search(pattern, webpage)
3309 self._downloader.trouble(u'ERROR: unable to extract video page URL')
3311 video_page = mobj.group(1)
3312 request = compat_urllib_request.Request(video_page)
3314 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3315 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3316 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3318 self.report_extract_vid_page(video_page)
3321 # Extract video links on video page
3322 """Extract video links of all sizes"""
3323 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3324 mobj = re.findall(pattern, webpage)
3326 self._downloader.trouble(u'ERROR: unable to extract video links')
3328 # Sort in resolution
3329 links = sorted(mobj)
3331 # Choose the lowest of the sort, i.e. highest resolution
3332 video_url = links[-1]
3333 # Only get the url. The resolution part in the tuple has no use anymore
3334 video_url = video_url[-1]
3335 # Treat escaped \u0026 style hex
# Py2 str has .decode; on Python 3 this raises AttributeError and the
# except branch re-decodes via bytes instead.
3337 video_url = video_url.decode("unicode_escape")
3338 except AttributeError: # Python 3
3339 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3345 'uploader': uploader,
3346 'upload_date': upload_date,
3347 'title': video_title,
3348 'ext': video_extension,
# Extractor for nba.com video pages: the media URL is derived directly
# from the path component of the page URL; metadata is scraped via the
# local _findProp helper (regex over the already-downloaded webpage).
# NOTE(review): lines elided from this listing (guard, _findProp's
# if/else, parts of the info dict).
3351 class NBAIE(InfoExtractor):
3352 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3355 def _real_extract(self, url):
3356 mobj = re.match(self._VALID_URL, url)
3358 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3361 video_id = mobj.group(1)
3362 if video_id.endswith('/index.html'):
3363 video_id = video_id[:-len('/index.html')]
3365 webpage = self._download_webpage(url, video_id)
# The CDN URL is constructed, not scraped — only metadata needs the page.
3367 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
# Closure over `webpage`: first regex group, HTML-unescaped, else default.
3368 def _findProp(rexp, default=None):
3369 m = re.search(rexp, webpage)
3371 return unescapeHTML(m.group(1))
3375 shortened_video_id = video_id.rpartition('/')[2]
3376 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3378 'id': shortened_video_id,
# NOTE(review): 'uploader_date' is not one of the documented info-dict
# keys — almost certainly a typo for 'upload_date'; fix when editing.
3382 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3383 'description': _findProp(r'<div class="description">(.*?)</h1>'),
# Extractor for justin.tv / twitch.tv: queries the Justin.tv REST API
# (channel archives or a single broadcast) and pages through results
# _JUSTIN_PAGE_LIMIT at a time.
# NOTE(review): lines elided from this listing ("try:" headers, guards,
# the paging loop header and the final return).
3387 class JustinTVIE(InfoExtractor):
3388 """Information extractor for justin.tv and twitch.tv"""
3389 # TODO: One broadcast may be split into multiple videos. The key
3390 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3391 # starts at 1 and increases. Can we treat all parts as one video?
3393 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3394 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3395 _JUSTIN_PAGE_LIMIT = 100
3396 IE_NAME = u'justin.tv'
3398 def report_extraction(self, file_id):
3399 """Report information extraction."""
3400 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3402 def report_download_page(self, channel, offset):
3403 """Report attempt to download a single page of videos."""
3404 self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
3405 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3407 # Return count of items, list of *valid* items
3408 def _parse_page(self, url):
3410 urlh = compat_urllib_request.urlopen(url)
3411 webpage_bytes = urlh.read()
3412 webpage = webpage_bytes.decode('utf-8', 'ignore')
3413 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3414 self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
# A non-list response is the API's error envelope.
3417 response = json.loads(webpage)
3418 if type(response) != list:
3419 error_text = response.get('error', 'unknown error')
3420 self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
3423 for clip in response:
3424 video_url = clip['video_file_url']
3426 video_extension = os.path.splitext(video_url)[1][1:]
# start_time is ISO-ish 'YYYY-MM-DD...'; strip dashes -> YYYYMMDD.
3427 video_date = re.sub('-', '', clip['start_time'][:10])
3428 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3429 video_id = clip['id']
3430 video_title = clip.get('title', video_id)
3434 'title': video_title,
3435 'uploader': clip.get('channel_name', video_uploader_id),
3436 'uploader_id': video_uploader_id,
3437 'upload_date': video_date,
3438 'ext': video_extension,
# Count reflects the raw response so the caller can detect a short page.
3440 return (len(response), info)
3442 def _real_extract(self, url):
3443 mobj = re.match(self._VALID_URL, url)
3445 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Group 2 present => single broadcast URL (/b/<id>); else a channel.
3448 api = 'http://api.justin.tv'
3449 video_id = mobj.group(mobj.lastindex)
3451 if mobj.lastindex == 1:
3453 api += '/channel/archives/%s.json'
3455 api += '/broadcast/by_archive/%s.json'
3456 api = api % (video_id,)
3458 self.report_extraction(video_id)
3462 limit = self._JUSTIN_PAGE_LIMIT
# Page until the API returns fewer than `limit` items.
3465 self.report_download_page(video_id, offset)
3466 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3467 page_count, page_info = self._parse_page(page_url)
3468 info.extend(page_info)
3469 if not paged or page_count != limit:
# Extractor for funnyordie.com: media URL from the second <source> tag,
# title from the player_page_h1 anchor, description from og:description.
# NOTE(review): guards ("if m is None:") and the final info dict are
# elided from this listing.
3474 class FunnyOrDieIE(InfoExtractor):
3475 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3477 def _real_extract(self, url):
3478 mobj = re.match(self._VALID_URL, url)
3480 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3483 video_id = mobj.group('id')
3484 webpage = self._download_webpage(url, video_id)
3486 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3488 self._downloader.trouble(u'ERROR: unable to find video information')
3489 video_url = unescapeHTML(m.group('url'))
3491 m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
3493 self._downloader.trouble(u'Cannot find video title')
3494 title = unescapeHTML(m.group('title'))
3496 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3498 desc = unescapeHTML(m.group('desc'))
3507 'description': desc,
# Extractor for tweetreel.com: the status id scraped from the page is
# enough to construct the .mov media URL directly; the remaining regexes
# pull description, uploader and upload date (unix timestamp -> YYYYMMDD).
# NOTE(review): guards and parts of the result dict are elided from this
# listing.
3511 class TweetReelIE(InfoExtractor):
3512 _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'
3514 def _real_extract(self, url):
3515 mobj = re.match(self._VALID_URL, url)
3517 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3520 video_id = mobj.group('id')
3521 webpage = self._download_webpage(url, video_id)
3523 m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
3525 self._downloader.trouble(u'ERROR: Cannot find status ID')
3526 status_id = m.group(1)
3528 m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
3530 self._downloader.trouble(u'WARNING: Cannot find description')
# Strip embedded <a>...</a> tags before unescaping the tweet text.
3531 desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()
3533 m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
3535 self._downloader.trouble(u'ERROR: Cannot find uploader')
3536 uploader = unescapeHTML(m.group('uploader'))
3537 uploader_id = unescapeHTML(m.group('uploader_id'))
3539 m = re.search(r'<span unixtime="([0-9]+)"', webpage)
3541 self._downloader.trouble(u'ERROR: Cannot find upload date')
3542 upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')
# Media URL is fully determined by the status id.
3545 video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'
3552 'description': desc,
3553 'uploader': uploader,
3554 'uploader_id': uploader_id,
3555 'internal_id': status_id,
3556 'upload_date': upload_date
# Extractor for store.steampowered.com: one page can host several game
# trailers, so movie URL matches are zipped with <span class="title">
# matches to pair each video with its name.  _VALID_URL is written with
# (?x)-style whitespace, hence the re.VERBOSE in suitable()/extract.
# NOTE(review): lines elided (the gameID group of _VALID_URL, the info
# dict body and the videos-list return).
3560 class SteamIE(InfoExtractor):
3561 _VALID_URL = r"""http://store.steampowered.com/
3562 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3564 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
# Overrides the base suitable() because _VALID_URL needs re.VERBOSE.
3567 def suitable(self, url):
3568 """Receives a URL and returns True if suitable for this IE."""
3569 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
3571 def _real_extract(self, url):
3572 m = re.match(self._VALID_URL, url, re.VERBOSE)
3573 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3574 gameID = m.group('gameID')
3575 videourl = 'http://store.steampowered.com/video/%s/' % gameID
3576 webpage = self._download_webpage(videourl, gameID)
3577 mweb = re.finditer(urlRE, webpage)
3578 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3579 titles = re.finditer(namesRE, webpage)
# Pair each movie match with its title match positionally.
3581 for vid,vtitle in zip(mweb,titles):
3582 video_id = vid.group('videoID')
3583 title = vtitle.group('videoName')
3584 video_url = vid.group('videoURL')
3586 self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
3591 'title': unescapeHTML(title)
# Extractor for ustream.tv recorded videos: the media URL is built from
# the numeric id; title and uploader are scraped from the page.
# NOTE(review): the info dict head and return are elided from this
# listing; the re.search results are used unguarded (m.group on a
# possible None) — verify against the full file.
3596 class UstreamIE(InfoExtractor):
3597 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3598 IE_NAME = u'ustream'
3600 def _real_extract(self, url):
3601 m = re.match(self._VALID_URL, url)
3602 video_id = m.group('videoID')
# CDN URL is derived from the id alone.
3603 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3604 webpage = self._download_webpage(url, video_id)
3605 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3606 title = m.group('title')
3607 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3608 uploader = m.group('uploader')
3614 'uploader': uploader
# Extractor for rbmaradio.com shows: all metadata comes from the
# `gon.show` JSON blob embedded in a <script> tag; the audio URL is the
# akamai_url with a fixed 256 kbps cbr parameter appended.
# NOTE(review): "try:" header and the info-dict head are elided from
# this listing.
3618 class RBMARadioIE(InfoExtractor):
3619 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3621 def _real_extract(self, url):
3622 m = re.match(self._VALID_URL, url)
3623 video_id = m.group('videoID')
3625 webpage = self._download_webpage(url, video_id)
3626 m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3628 raise ExtractorError(u'Cannot find metadata')
3629 json_data = m.group(1)
3632 data = json.loads(json_data)
3633 except ValueError as e:
3634 raise ExtractorError(u'Invalid JSON: ' + str(e))
3636 video_url = data['akamai_url'] + '&cbr=256'
# Extension derived from the URL path suffix.
3637 url_parts = compat_urllib_parse_urlparse(video_url)
3638 video_ext = url_parts.path.rpartition('.')[2]
# Optional metadata uses .get() chains so missing keys yield None.
3643 'title': data['title'],
3644 'description': data.get('teaser_text'),
3645 'location': data.get('country_of_origin'),
3646 'uploader': data.get('host', {}).get('name'),
3647 'uploader_id': data.get('host', {}).get('slug'),
3648 'thumbnail': data.get('image', {}).get('large_url_2x'),
3649 'duration': data.get('duration'),
# Extractor for youporn.com: scrapes title/date/uploader, then parses
# the downloadList <ul> for every available format link.  The format
# label "<size>p-<bitrate>k" is reconstructed from the link path, and
# the requested --format (best/worst/all/specific) selects from the
# accumulated formats list.
# NOTE(review): lines elided throughout this listing (guards, loop
# headers such as "for link in links:", parts of the per-format dict,
# the best/all return statements).
3654 class YouPornIE(InfoExtractor):
3655 """Information extractor for youporn.com."""
3656 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3658 def _print_formats(self, formats):
3659 """Print all available formats"""
3660 print(u'Available formats:')
3661 print(u'ext\t\tformat')
3662 print(u'---------------------------------')
3663 for format in formats:
3664 print(u'%s\t\t%s' % (format['ext'], format['format']))
# Return the single format dict whose 'format' equals req_format
# (loop/return lines elided from this listing).
3666 def _specific(self, req_format, formats):
3668 if(x["format"]==req_format):
3672 def _real_extract(self, url):
3673 mobj = re.match(self._VALID_URL, url)
3675 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3678 video_id = mobj.group('videoid')
# Age-gate cookie lets us fetch the page without the interstitial.
3680 req = compat_urllib_request.Request(url)
3681 req.add_header('Cookie', 'age_verified=1')
3682 webpage = self._download_webpage(req, video_id)
3684 # Get the video title
3685 result = re.search(r'videoTitleArea">(?P<title>.*)</h1>', webpage)
3687 raise ExtractorError(u'ERROR: unable to extract video title')
3688 video_title = result.group('title').strip()
3690 # Get the video date
3691 result = re.search(r'Date:</b>(?P<date>.*)</li>', webpage)
3693 self._downloader.to_stderr(u'WARNING: unable to extract video date')
3696 upload_date = result.group('date').strip()
3698 # Get the video uploader
3699 result = re.search(r'Submitted:</b>(?P<uploader>.*)</li>', webpage)
3701 self._downloader.to_stderr(u'ERROR: unable to extract uploader')
3702 video_uploader = None
3704 video_uploader = result.group('uploader').strip()
3705 video_uploader = clean_html( video_uploader )
3707 # Get all of the formats available
3708 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3709 result = re.search(DOWNLOAD_LIST_RE, webpage)
3711 raise ExtractorError(u'Unable to extract download list')
3712 download_list_html = result.group('download_list').strip()
3714 # Get all of the links from the page
3715 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3716 links = re.findall(LINK_RE, download_list_html)
3717 if(len(links) == 0):
3718 raise ExtractorError(u'ERROR: no known formats available for video')
3720 self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))
3725 # A link looks like this:
3726 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3727 # A path looks like this:
3728 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3729 video_url = unescapeHTML( link )
3730 path = compat_urllib_parse_urlparse( video_url ).path
3731 extension = os.path.splitext( path )[1][1:]
# Path segment 4 encodes "<size>p_<bitrate>k_<id>"; first two pieces
# become the format label.
3732 format = path.split('/')[4].split('_')[:2]
3735 format = "-".join( format )
3736 title = u'%s-%s-%s' % (video_title, size, bitrate)
3741 'uploader': video_uploader,
3742 'upload_date': upload_date,
3747 'description': None,
3751 if self._downloader.params.get('listformats', None):
3752 self._print_formats(formats)
3755 req_format = self._downloader.params.get('format', None)
3756 self._downloader.to_screen(u'[youporn] Format: %s' % req_format)
# Formats are ordered best-first, so worst == last element.
3758 if req_format is None or req_format == 'best':
3760 elif req_format == 'worst':
3761 return [formats[-1]]
3762 elif req_format in ('-1', 'all'):
3765 format = self._specific( req_format, formats )
3767 self._downloader.trouble(u'ERROR: requested format not available')
# Extractor for pornotube.com: the flv URL and upload date are scraped
# from the watch page; the title comes from the URL itself (named group
# in _VALID_URL).
# NOTE(review): "if result is None:" guards and parts of the info dict
# are elided from this listing; note the upload-date failure message
# says "title" — likely a copy/paste slip to fix when editing.
3773 class PornotubeIE(InfoExtractor):
3774 """Information extractor for pornotube.com."""
3775 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3777 def _real_extract(self, url):
3778 mobj = re.match(self._VALID_URL, url)
3780 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3783 video_id = mobj.group('videoid')
3784 video_title = mobj.group('title')
3786 # Get webpage content
3787 webpage = self._download_webpage(url, video_id)
3790 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3791 result = re.search(VIDEO_URL_RE, webpage)
3793 self._downloader.trouble(u'ERROR: unable to extract video url')
3795 video_url = compat_urllib_parse.unquote(result.group('url'))
3797 #Get the uploaded date
3798 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3799 result = re.search(VIDEO_UPLOADED_RE, webpage)
3801 self._downloader.trouble(u'ERROR: unable to extract video title')
3803 upload_date = result.group('date')
3805 info = {'id': video_id,
3808 'upload_date': upload_date,
3809 'title': video_title,
# Extractor for youjizz.com: title from the watch page, then a hop to
# the embed page whose flash variables contain the real media URL.
# NOTE(review): "if result is None:" guards and parts of the info dict
# are elided from this listing.
3817 class YouJizzIE(InfoExtractor):
3818 """Information extractor for youjizz.com."""
3819 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3821 def _real_extract(self, url):
3822 mobj = re.match(self._VALID_URL, url)
3824 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3827 video_id = mobj.group('videoid')
3829 # Get webpage content
3830 webpage = self._download_webpage(url, video_id)
3832 # Get the video title
3833 result = re.search(r'<title>(?P<title>.*)</title>', webpage)
3835 raise ExtractorError(u'ERROR: unable to extract video title')
3836 video_title = result.group('title').strip()
3838 # Get the embed page
3839 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3841 raise ExtractorError(u'ERROR: unable to extract embed page')
# The numeric embed id replaces the slug id from the watch URL.
3843 embed_page_url = result.group(0).strip()
3844 video_id = result.group('videoid')
3846 webpage = self._download_webpage(embed_page_url, video_id)
# Media URL is passed to the flash player via addVariable("file", ...).
3849 result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
3851 raise ExtractorError(u'ERROR: unable to extract video url')
3852 video_url = result.group('source')
3854 info = {'id': video_id,
3856 'title': video_title,
# player_url enables rtmpdump-style retrieval via the embed SWF.
3859 'player_url': embed_page_url}
3864 def gen_extractors():
3865 """ Return a list of an instance of every supported extractor.
3866 The order does matter; the first extractor matched is the one handling the URL.
3869 YoutubePlaylistIE(),
3893 StanfordOpenClassroomIE(),