2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader.

        downloader may be None; set_downloader() can attach one later.
        """
        self.set_downloader(downloader)
76 def suitable(self, url):
77 """Receives a URL and returns True if suitable for this IE."""
78 return re.match(self._VALID_URL, url) is not None
        """Getter method for _WORKING."""

        """Initializes an instance (authentication, etc)."""
        # NOTE(review): the enclosing "def" lines for these two methods are
        # elided from this chunk; _real_initialize() below is the
        # subclass-provided initialization hook.
        self._real_initialize()
    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # NOTE(review): one intermediate line is elided from this chunk
        # (presumably a call to self.initialize() — confirm against the
        # full file).
        return self._real_extract(url)
    def set_downloader(self, downloader):
        """Sets the downloader for this IE.

        downloader may be None; methods that report progress assume a
        downloader has been attached before they run.
        """
        self._downloader = downloader
    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""

        # NOTE(review): property definition line is elided from this chunk;
        # this body derives the IE name from the class name by dropping the
        # trailing "IE" suffix (e.g. YoutubeIE -> "Youtube").
        return type(self).__name__[:-2]
    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        # NOTE(review): this chunk elides the guard lines ("if note is
        # None:", "try:", "if errnote is None:"); the statements below are
        # reproduced as extracted, with indentation reflecting that
        # structure.
            note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                errnote = u'Unable to download webpage'
            # Re-raise as ExtractorError, preserving the original traceback.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
123 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
124 """ Returns the data of the page as a string """
125 urlh = self._request_webpage(url_or_request, video_id, note, errnote)
126 webpage_bytes = urlh.read()
127 return webpage_bytes.decode('utf-8', 'replace')
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # NOTE(review): this chunk elides several source lines.  The verbose
    # regex below is shown without its opening assignment line
    # (presumably _VALID_URL = r"""^ ... — confirm against the full file),
    # and the format tables near the bottom are truncated.
                     (?:https?://)?                                       # http(s):// (optional)
                     (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                        tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                     (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                     (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                     (?:                                                  # the various things that can precede the ID:
                         (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                         |(?:                                             # or the v= param in all its forms
                             (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                             (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                             (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                         )?                                               # optional -> youtube.com/xxxx is OK
                     )?                                                   # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                     # here is it! the YouTube video ID
                     (?(1).+)?                                            # if we found the ID, everything can follow
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension (table truncated in this chunk)
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # itag -> display resolution (table truncated in this chunk)
    _video_dimensions = {
191 def suitable(self, url):
192 """Receives a URL and returns True if suitable for this IE."""
193 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to check for available video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download a subtitle track."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self._downloader.to_screen(u'[youtube] %s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')
    def _get_available_subtitles(self, video_id):
        """Fetch the available subtitle tracks for *video_id*.

        Builds a {lang_code: track_name} dict from YouTube's timedtext
        listing.  On download failure or when no subtitles exist, returns a
        (warning_message, None) tuple instead — callers must check which
        shape they got.  NOTE(review): the "try:" opener and the final
        success return are elided from this chunk.
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        # Map lang_code -> human-readable track name.
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'WARNING: video doesn\'t have subtitles', None)
253 def _list_available_subtitles(self, video_id):
254 sub_lang_list = self._get_available_subtitles(video_id)
255 self.report_video_subtitles_available(video_id, sub_lang_list)
    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """Download one subtitle track.

        Returns (None, sub_lang, subtitle_text) on success and a
        (warning_message, None) tuple on failure.  NOTE(review): the
        urlencode parameter dict, the "try:" opener and an "if not sub:"
        guard are elided from this chunk.
        """
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
        url = 'http://www.youtube.com/api/timedtext?' + params
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
            return (u'WARNING: Did not fetch video subtitles', None)
        return (None, sub_lang, sub)
    def _extract_subtitle(self, video_id):
        """Pick one subtitle language and download it.

        Language preference: explicit --sub-lang option, then English,
        then the first available track.  NOTE(review): the "else:" branch
        opener and the final return are elided from this chunk.
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            sub_lang = list(sub_lang_list.keys())[0]
        if not sub_lang in sub_lang_list:
            return (u'WARNING: no closed captions found in the specified language "%s"' % sub_lang, None)
        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
    def _extract_all_subtitles(self, video_id):
        """Download every available subtitle track for *video_id*.

        NOTE(review): the "subtitles = []" initializer and the final
        return are elided from this chunk.
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
    def _print_formats(self, formats):
        """Print a table of the given itags with extension and resolution.

        NOTE(review): the "for x in formats:" loop header is elided from
        this chunk.
        """
        print('Available formats:')
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
    def _real_initialize(self):
        """Set language, then optionally log in and confirm age.

        Credentials come from --username/--password or, failing that, from
        the user's .netrc entry for the "youtube" machine.  All network
        failures here are reported as warnings (or trouble() for age
        confirmation) rather than raised.  NOTE(review): this chunk elides
        many lines ("try:" openers, early returns, parts of the login form
        dict); statements are reproduced as extracted.
        """
        if self._downloader is None:
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))

        # Set language (best effort — failure only produces a warning)
        request = compat_urllib_request.Request(self._LANG_URL)
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))

        # No authentication to be performed
        request = compat_urllib_request.Request(self._LOGIN_URL)
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to fetch login page: %s' % compat_str(err))

        # Scrape the anti-forgery tokens out of the login page
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
            galx = match.group(1)
        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'PersistentCookie': u'yes',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'signIn': u'Sign in',
                u'service': u'youtube',
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, the credentials were rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))

        # Confirm age (form dict partially elided in this chunk)
            'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
    def _extract_id(self, url):
        """Extract the video ID (group 2 of _VALID_URL) from *url*.

        NOTE(review): the "if mobj is None:" guard and the final return
        are elided from this chunk.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(2)
    def _real_extract(self, url):
        """Extract video info dict(s) for a YouTube URL.

        Pipeline: resolve next_url redirects -> download watch page ->
        download get_video_info (trying several &el= variants) -> pull
        uploader/title/thumbnail/date/description/subtitles/duration ->
        choose format URL(s) -> build one result dict per chosen format.
        NOTE(review): this chunk elides many lines ("if"/"try:"/"else:"
        openers, early returns, the results-list assembly); statements are
        reproduced as extracted.
        """
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
            # Unescape the JS-escaped URL (\/ -> /)
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Get video info, trying several &el= variants until one has a token
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id (optional — warning only)
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
            video_uploader_id = mobj.group(1)
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: normalize separators, then try several date formats
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
            video_description = ''

        # closed captions
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
                (sub_error, sub_lang, sub) = video_subtitles[0]
                    self._downloader.trouble(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                    self._downloader.trouble(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # itag -> signed download URL
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                        video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

        # Build one result dict per selected format
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page, then POST the family-filter form to
        disable age filtering.  NOTE(review): "try:" openers and returns
        are elided from this chunk.
        """
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age (form dict partially elided in this chunk)
            'submit': "Continue - I'm over 18",
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        """Extract the media URL, title and uploader for a Metacafe page.

        yt-prefixed IDs are delegated to the YouTube extractor via
        self._downloader.download().  NOTE(review): this chunk elides many
        guard lines ("if mobj is None:", "try:", "else:", returns);
        statements are reproduced as extracted.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
            # Fall back to the flashvars blob
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # Result dict (list wrapper elided in this chunk)
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the best available media URL, title and uploader.

        NOTE(review): this chunk elides several guard lines ("if mobj is
        None:", branch openers, returns); statements are reproduced as
        extracted.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter so age-restricted videos are served.
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Pick the highest-quality stream key present in flashvars
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
            self._downloader.trouble(u'ERROR: unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
            self._downloader.trouble(u'ERROR: unable to extract video URL')

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
            # lookin for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
                video_uploader = mobj_official.group(1)
            video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
            # Page shows DD-MM-YYYY; recombine as YYYYMMDD.
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        # Result dict (list wrapper elided in this chunk)
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the flv media URL, title and uploader from the page.

        NOTE(review): this chunk elides several guard lines ("if mobj is
        None:", "try:", returns); statements are reproduced as extracted.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        # Result dict (list wrapper elided in this chunk)
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
922 def _real_extract(self, url, new_video=True):
923 # Extract ID from URL
924 mobj = re.match(self._VALID_URL, url)
926 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
929 video_id = mobj.group(2)
930 video_extension = 'flv'
932 # Rewrite valid but non-extractable URLs as
933 # extractable English language /watch/ URLs
934 if re.match(self._VPAGE_URL, url) is None:
935 request = compat_urllib_request.Request(url)
937 webpage = compat_urllib_request.urlopen(request).read()
938 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
939 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
942 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
944 self._downloader.trouble(u'ERROR: Unable to extract id field')
946 yahoo_id = mobj.group(1)
948 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
950 self._downloader.trouble(u'ERROR: Unable to extract vid field')
952 yahoo_vid = mobj.group(1)
954 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
955 return self._real_extract(url, new_video=False)
957 # Retrieve video webpage to extract further information
958 request = compat_urllib_request.Request(url)
960 self.report_download_webpage(video_id)
961 webpage = compat_urllib_request.urlopen(request).read()
962 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
963 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
966 # Extract uploader and title from webpage
967 self.report_extraction(video_id)
968 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
970 self._downloader.trouble(u'ERROR: unable to extract video title')
972 video_title = mobj.group(1).decode('utf-8')
974 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
976 self._downloader.trouble(u'ERROR: unable to extract video uploader')
978 video_uploader = mobj.group(1).decode('utf-8')
980 # Extract video thumbnail
981 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
983 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
985 video_thumbnail = mobj.group(1).decode('utf-8')
987 # Extract video description
988 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
990 self._downloader.trouble(u'ERROR: unable to extract video description')
992 video_description = mobj.group(1).decode('utf-8')
993 if not video_description:
994 video_description = 'No description available.'
996 # Extract video height and width
997 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
999 self._downloader.trouble(u'ERROR: unable to extract video height')
1001 yv_video_height = mobj.group(1)
1003 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1005 self._downloader.trouble(u'ERROR: unable to extract video width')
1007 yv_video_width = mobj.group(1)
1009 # Retrieve video playlist to extract media URL
1010 # I'm not completely sure what all these options are, but we
1011 # seem to need most of them, otherwise the server sends a 401.
1012 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1013 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1014 request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1015 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1016 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1018 self.report_download_webpage(video_id)
1019 webpage = compat_urllib_request.urlopen(request).read()
1020 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1021 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1024 # Extract media URL from playlist XML
1025 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1027 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1029 video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1030 video_url = unescapeHTML(video_url)
1033 'id': video_id.decode('utf-8'),
1035 'uploader': video_uploader,
1036 'upload_date': None,
1037 'title': video_title,
1038 'ext': video_extension.decode('utf-8'),
1039 'thumbnail': video_thumbnail.decode('utf-8'),
1040 'description': video_description,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video information from a vimeo.com URL.

        Returns a single-element list of info dictionaries, or None after
        reporting trouble via the downloader.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        # Normalize schemeless and player/HLS-redirect URLs to a canonical page URL
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the player page
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except:
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # Extract upload date (YYYYMMDD)
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = {'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the first available (codec, ext, quality) triple, best quality first
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    % (video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':           video_id,
            'url':          video_url,
            'uploader':     video_uploader,
            'uploader_id':  video_uploader_id,
            'upload_date':  video_upload_date,
            'title':        video_title,
            'ext':          video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download a webpage and return its raw body, or None after reporting trouble."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url and match regex against it.

        matchTuples is a list of (group_index, key, error_message); each
        named group is stored under key in the returned dict. Reports
        trouble and returns None when the page or a group is missing.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Extract the rtmp URL of a live stream page (result is assembled but not returned upstream)."""
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            (r'(s_artestras_scst_geoFRDE_' + video_lang + r'.*?)\'.*?' +
                r'(http://.*?\.swf).*?' +
                r'(rtmp://.*?)\''),
            re.DOTALL,
            [
                (1, 'path', u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url', u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the chain of XML references of an arte+7 page and return an info dict."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            (r'<video id="(.*?)".*?>.*?' +
                r'<name>(.*?)</name>.*?' +
                r'<dateVideo>(.*?)</dateVideo>.*?' +
                r'<url quality="hd">(.*?)</url>'),
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
        }

    def _real_extract(self, url):
        """Dispatch to the live-stream or plus7 extractor based on the URL shape."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain.

        Returns True when url redirected somewhere else (and queues the new
        URL for download), False when it resolves to itself.
        """
        class HeadRequest(compat_urllib_request.Request):
            # Issue HEAD instead of GET so we only fetch headers
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k, v) for k, v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k, v) for k, v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      headers=newheaders,
                                                                      origin_req_host=req.get_origin_req_host(),
                                                                      unverifiable=True))

        # Build our own opener so the HEAD/fallback handlers are used
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True

    def _real_extract(self, url):
        """Best-effort extraction: scrape common embedded-player patterns from the page."""
        if self._test_redirect(url):
            return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':           video_id,
            'url':          video_url,
            'uploader':     video_uploader,
            'upload_date':  None,
            'title':        video_title,
            'ext':          video_extension,
        }]
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearchN:/ytsearchall: prefix and queue the matching videos."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        # GData pages hold at most 50 results; keep paging until we have enough
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # totalItems caps the effective limit when the API has fewer results than requested
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the gvsearchN:/gvsearchall: prefix and queue the matching videos."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                        return

            # No further pages: flush what was collected
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearchN:/yvsearchall: prefix and queue the matching videos."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()  # de-duplicate across result pages
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            # No further pages: flush what was collected
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Collect all video ids of a playlist and queue them for download."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Single video case: group(3) carries a direct video id
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])
            return

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        else:
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(playlist_id, pagenum)
            url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        total = len(video_ids)

        # Honour --playlist-start / --playlist-end (1-based; -1 means "to the end")
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        if len(video_ids) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        """Collect all video ids of a channel and queue them for download."""
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Download channel pages
        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(channel_id, pagenum)
            url = self._TEMPLATE_URL % (channel_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []
            for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Collect all upload ids of a user via the GData API and queue them."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)

        # Honour --playlist-start / --playlist-end (1-based; -1 means "to the end")
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    Resolves a blip.tv user page (or a ``bliptvuser:`` pseudo-URL) to the
    user's numeric id, pages through the mobile episode-list API collecting
    video ids, then queues each video URL for download.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    # Results per API page; a short page signals the last page.
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        # Extract username from the URL.
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            # The user page embeds the numeric users_id we need for the API.
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            mobj = re.search(r'data-users-id="([^"]+)"', page)
            page_base = page_base % mobj.group(1)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)

            request = compat_urllib_request.Request(page_base + "&page=" + str(pagenum))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # Fix: use compat_str(err) for consistency with every other
                # error path in this file (was plain str(err)).
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.
            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        # playliststart is 1-based in params; convert to 0-based slice index.
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = {'gateway_result': '1'}
        # Fix: Request data must be bytes on Python 3 (urlencode returns str).
        request = compat_urllib_request.Request(url,
                compat_urllib_parse.urlencode(free_download_indication).encode('ascii'))
        try:
            self.report_download_webpage(file_id)
            # Fix: decode the response so the str regexes below can match it.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse whitespace so the site's message fits on one line.
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        # Fix: these values are already str on Python 3 — the old
        # .decode('utf-8') calls raised AttributeError.
        file_title = mobj.group(1)

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': None,
            'upload_date': None,
            'title': file_title,
            'ext': file_extension,
        }]
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)

    def _real_initialize(self):
        """Log in with --username/--password or .netrc credentials, if any."""
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        # Fix: Request data must be bytes on Python 3 (urlencode returns str).
        request = compat_urllib_request.Request(self._LOGIN_URL,
                compat_urllib_parse.urlencode(login_form).encode('ascii'))
        try:
            self.report_login()
            # Fix: decode the response so the str regex below can match it.
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The video parameters are embedded in a JS swf.addVariable block;
        # BEFORE/AFTER bracket the JSON array we need.
        BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        # Prefer the HD stream, fall back to SD.
        video_url = params['hd_src']
        if not video_url:
            video_url = params['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(params['video_duration'])

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': params['thumbnail_src'],
        }
        return [info]
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Ask the page for its JSON metadata ("skin=json").
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves different (usable) data to the iTunes user agent.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                # Fix: basename is already str on Python 3; the old
                # title.decode('UTF-8') raised AttributeError.
                title, ext = os.path.splitext(basename)
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh,
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # Some responses wrap the payload in a 'Post' object.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError, KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        return [info]
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Fix: was self._download.trouble — a typo that raised
            # AttributeError instead of reporting the invalid URL.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': u'flv',
        }]
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                     |(https?://)?(www\.)?
                         (?P<showname>thedailyshow|colbertnation)\.com/
                        (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Override: _VALID_URL is written with re.VERBOSE whitespace.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Expand :tds / :colbert abbreviations to the newest-episode URL.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # A bare full-episodes URL means "download the newest episode";
            # the site redirects it to a specific one.
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
            return

        results = []

        # Fix: the requested format is loop-invariant — read it once
        # instead of on every episode part.
        req_format = self._downloader.params.get('format', None)

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum, itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            # (renamed from `format`, which shadowed the builtin).
            video_format, rtmp_video_url = turls[-1]

            # Select format if we can find one
            for f, v in turls:
                if f == req_format:
                    video_format, rtmp_video_url = f, v

            # The RTMP stream is mirrored over plain HTTP; rewrite the URL.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': video_format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        showName = match.group('showname')
        videoId = match.group('episode')

        self.report_extraction(showName)
        try:
            # Honour the charset declared in the Content-Type header,
            # defaulting to UTF-8 when none is given.
            handle = compat_urllib_request.urlopen(url)
            raw_page = handle.read()
            charset_match = re.match(r'text/html; charset="?([^"]+)"?', handle.headers['Content-Type'])
            page = raw_page.decode(charset_match.group(1) if charset_match else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        # Pull description, thumbnail and the flash player URL out of the
        # page's meta tags; the config URL is a query param of the player.
        description = unescapeHTML(re.search('<meta name="description" content="([^"]*)"', page).group(1))
        imgUrl = unescapeHTML(re.search('<meta property="og:image" content="([^"]*)"', page).group(1))
        playerUrl = unescapeHTML(re.search('<meta property="og:video" content="([^"]*)"', page).group(1))
        configUrl = compat_urllib_parse.unquote(re.search('config=(.*)$', playerUrl).group(1))

        self.report_config_download(showName)
        try:
            config_handle = compat_urllib_request.urlopen(configUrl)
            charset_match = re.match(r'text/html; charset="?([^"]+)"?', config_handle.headers['Content-Type'])
            config_text = config_handle.read().decode(charset_match.group(1) if charset_match else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        config_text = config_text.replace("'", '"')

        try:
            config = json.loads(config_text)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        return [{
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'flv',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }]
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        # Metadata (title, description, f4m manifest URL) lives in a
        # moogaloop XML document.
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        # The f4m manifest names the media segment; reassemble the real URL
        # from its pieces.
        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError:
            # Fix: dropped the unused `as err` binding.
            self._downloader.trouble(u'\nERROR: Invalid manifest file')
            return

        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'mp4'

        return [info]
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = match.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Extract video URL (percent-encoded in the flv_url parameter)
        match = re.search(r'flv_url=(.+?)&', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        # Extract title from the page <title> tag
        match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = match.group(1)

        # Extract video thumbnail
        match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Both the uploader and the track slug are encoded in the URL itself.
        uploader = match.group(1)
        slug_title = match.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # Step 1: resolve the page URL to track metadata via the public API.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # Step 2: fetch the stream table and pick the 128kbps mp3 URL.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        return [{
            'id': info['id'],
            'url': mediaURL,
            'uploader': info['user']['username'],
            'upload_date': info['created_at'],
            'title': info['title'],
            'ext': u'mp3',
            'description': info['description'],
        }]
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        if m is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: the RTMP id is base64-encoded in jsclassref.
        m = re.search(r"jsclassref='([^']*)'", webpage)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(m.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        m = re.search(r'contentTitle = "(.*?)";', webpage)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = m.group(1)

        # Extract description (optional; fall back to a placeholder)
        video_description = u'No description available.'
        m = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if m is not None:
            video_description = m.group(1)

        # The file name doubles as the video id; its suffix is the extension.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                pass
        return None

    def _print_formats(self, formats):
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        # Fix: regex groups are already str on Python 3 — the old
        # .decode('utf-8') calls raised AttributeError.
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON (decode explicitly so this also works before Py3.6)
        json_data = json.loads(jsonData.decode('utf-8'))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Probe each advertised format and keep the first live URL.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            # Fix: replaced the fragile `and/or` ternary (and a str.decode)
            # with a conditional expression.
            'format': (u'NA' if format_param is None else format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
# Extractor for openclassroom.stanford.edu. _real_extract branches on the
# named groups of _VALID_URL: a specific video (course+video), a course
# page (course only), or the site root (neither) — the latter two build
# 'playlist'-style info whose entries are re-dispatched via self.extract().
# NOTE(review): gapped listing — try:/except framing, "if mobj is None:"
# guards, dict braces and return statements are absent (numbering holes).
2940 class StanfordOpenClassroomIE(InfoExtractor):
2941 """Information extractor for Stanford's Open ClassRoom"""
2943 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2944 IE_NAME = u'stanfordoc'
2946 def report_download_webpage(self, objid):
2947 """Report information extraction."""
2948 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2950 def report_extraction(self, video_id):
2951 """Report information extraction."""
2952 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2954 def _real_extract(self, url):
2955 mobj = re.match(self._VALID_URL, url)
2957 raise ExtractorError(u'Invalid URL: %s' % url)
2959 if mobj.group('course') and mobj.group('video'): # A specific video
2960 course = mobj.group('course')
2961 video = mobj.group('video')
2963 'id': course + '_' + video,
2965 'upload_date': None,
2968 self.report_extraction(info['id'])
# video metadata lives in an XML file next to the course's videos folder
2969 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2970 xmlUrl = baseUrl + video + '.xml'
2972 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2973 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2974 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2976 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2978 info['title'] = mdoc.findall('./title')[0].text
2979 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2981 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2983 info['ext'] = info['url'].rpartition('.')[2]
2985 elif mobj.group('course'): # A course page
2986 course = mobj.group('course')
2991 'upload_date': None,
2994 coursepage = self._download_webpage(url, info['id'],
2995 note='Downloading course info page',
2996 errnote='Unable to download course info page')
2998 m = re.search('<h1>([^<]+)</h1>', coursepage)
3000 info['title'] = unescapeHTML(m.group(1))
# fall back to the course id when no <h1> title was found
3002 info['title'] = info['id']
3004 m = re.search('<description>([^<]+)</description>', coursepage)
3006 info['description'] = unescapeHTML(m.group(1))
# each VideoPage link becomes a 'reference' entry extracted recursively
3008 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3011 'type': 'reference',
3012 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3016 for entry in info['list']:
3017 assert entry['type'] == 'reference'
3018 results += self.extract(entry['url'])
# neither course nor video: enumerate all courses from the site root
3022 'id': 'Stanford OpenClassroom',
3025 'upload_date': None,
3028 self.report_download_webpage(info['id'])
3029 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3031 rootpage = compat_urllib_request.urlopen(rootURL).read()
3032 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3033 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
3036 info['title'] = info['id']
3038 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3041 'type': 'reference',
3042 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3047 for entry in info['list']:
3048 assert entry['type'] == 'reference'
3049 results += self.extract(entry['url'])
# Extractor for mtv.com video pages: scrapes mtv_vt/mtv_an/mtvn_uri meta
# tags plus the default playlist id, then fetches a mediaGen XML document
# and takes the last (highest-quality) <rendition> as the video URL.
# NOTE(review): gapped listing — "if mobj is None:" guards, try:/return
# lines and the final info-dict braces are absent (numbering holes).
3052 class MTVIE(InfoExtractor):
3053 """Information extractor for MTV.com"""
3055 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3058 def report_extraction(self, video_id):
3059 """Report information extraction."""
3060 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3062 def _real_extract(self, url):
3063 mobj = re.match(self._VALID_URL, url)
3065 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# the scheme is optional in _VALID_URL; normalize to http:// for download
3067 if not mobj.group('proto'):
3068 url = 'http://' + url
3069 video_id = mobj.group('videoid')
3071 webpage = self._download_webpage(url, video_id)
3073 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3075 self._downloader.trouble(u'ERROR: unable to extract song name')
3077 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3078 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3080 self._downloader.trouble(u'ERROR: unable to extract performer')
3082 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3083 video_title = performer + ' - ' + song_name
3085 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3087 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3089 mtvn_uri = mobj.group(1)
3091 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3093 self._downloader.trouble(u'ERROR: unable to extract content id')
3095 content_id = mobj.group(1)
3097 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3098 self.report_extraction(video_id)
3099 request = compat_urllib_request.Request(videogen_url)
3101 metadataXml = compat_urllib_request.urlopen(request).read()
3102 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3103 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
3106 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3107 renditions = mdoc.findall('.//rendition')
3109 # For now, always pick the highest quality.
3110 rendition = renditions[-1]
# format label: "<ext>-<width>x<height>_<bitrate>" from rendition attribs
3113 _,_,ext = rendition.attrib['type'].partition('/')
3114 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3115 video_url = rendition.find('./src').text
3117 self._downloader.trouble('Invalid rendition field.')
3123 'uploader': performer,
3124 'upload_date': None,
3125 'title': video_title,
# Extractor for v.youku.com. Fetches the getPlayList JSON, then derives the
# real segment file ids from an obfuscated 'fileid' + 'seed' pair via a
# deterministic pseudo-random shuffle of a fixed character alphabet, and
# emits one info dict per video segment (…_partNN).
# NOTE(review): gapped listing — the _gen_sid def line, several return
# statements, guards and dict braces are absent (numbering holes).
3133 class YoukuIE(InfoExtractor):
3134 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3136 def report_download_webpage(self, file_id):
3137 """Report webpage download."""
3138 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))
3140 def report_extraction(self, file_id):
3141 """Report information extraction."""
3142 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
# (body of _gen_sid — its def line is missing from this listing)
# session id: millisecond timestamp concatenated with two random numbers
3145 nowTime = int(time.time() * 1000)
3146 random1 = random.randint(1000,1998)
3147 random2 = random.randint(1000,9999)
3149 return "%d%d%d" %(nowTime,random1,random2)
3151 def _get_file_ID_mix_string(self, seed):
# linear-congruential-style shuffle: seed picks one remaining alphabet
# character per round, producing a seed-dependent permutation
3153 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3155 for i in range(len(source)):
3156 seed = (seed * 211 + 30031 ) % 65536
3157 index = math.floor(seed / 65536 * len(source) )
3158 mixed.append(source[int(index)])
3159 source.remove(source[int(index)])
3160 #return ''.join(mixed)
3163 def _get_file_id(self, fileId, seed):
# decode: each '*'-separated number indexes into the shuffled alphabet
3164 mixed = self._get_file_ID_mix_string(seed)
3165 ids = fileId.split('*')
3169 realId.append(mixed[int(ch)])
3170 return ''.join(realId)
3172 def _real_extract(self, url):
3173 mobj = re.match(self._VALID_URL, url)
3175 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3177 video_id = mobj.group('ID')
3179 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3181 request = compat_urllib_request.Request(info_url, None, std_headers)
3183 self.report_download_webpage(video_id)
3184 jsondata = compat_urllib_request.urlopen(request).read()
3185 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3186 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3189 self.report_extraction(video_id)
3191 jsonstr = jsondata.decode('utf-8')
3192 config = json.loads(jsonstr)
3194 video_title = config['data'][0]['title']
3195 seed = config['data'][0]['seed']
# format selection against the formats Youku actually offers (streamfileids)
3197 format = self._downloader.params.get('format', None)
3198 supported_format = list(config['data'][0]['streamfileids'].keys())
3200 if format is None or format == 'best':
3201 if 'hd2' in supported_format:
3206 elif format == 'worst':
3214 fileid = config['data'][0]['streamfileids'][format]
3215 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3216 except (UnicodeDecodeError, ValueError, KeyError):
3217 self._downloader.trouble(u'ERROR: unable to extract info section')
3221 sid = self._gen_sid()
3222 fileid = self._get_file_id(fileid, seed)
3224 #column 8,9 of fileid represent the segment number
3225 #fileid[7:9] should be changed
3226 for index, key in enumerate(keys):
3228 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3229 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3232 'id': '%s_part%02d' % (video_id, index),
3233 'url': download_url,
3235 'upload_date': None,
3236 'title': video_title,
3239 files_info.append(info)
# Extractor for video.xnxx.com: downloads the page and pulls the flv URL,
# title and thumbnail out of it with the three class-level regexes.
# NOTE(review): gapped listing — "if result is None:" guards, try:/return
# lines and the info-dict braces are absent (numbering holes).
3244 class XNXXIE(InfoExtractor):
3245 """Information extractor for xnxx.com"""
3247 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
3249 VIDEO_URL_RE = r'flv_url=(.*?)&'
3250 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3251 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3253 def report_webpage(self, video_id):
3254 """Report information extraction"""
3255 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3257 def report_extraction(self, video_id):
3258 """Report information extraction"""
3259 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3261 def _real_extract(self, url):
3262 mobj = re.match(self._VALID_URL, url)
3264 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3266 video_id = mobj.group(1)
3268 self.report_webpage(video_id)
3270 # Get webpage content
3272 webpage_bytes = compat_urllib_request.urlopen(url).read()
3273 webpage = webpage_bytes.decode('utf-8')
3274 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3275 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
# flv URL is percent-encoded inside the page, hence the unquote
3278 result = re.search(self.VIDEO_URL_RE, webpage)
3280 self._downloader.trouble(u'ERROR: unable to extract video url')
3282 video_url = compat_urllib_parse.unquote(result.group(1))
3284 result = re.search(self.VIDEO_TITLE_RE, webpage)
3286 self._downloader.trouble(u'ERROR: unable to extract video title')
3288 video_title = result.group(1)
3290 result = re.search(self.VIDEO_THUMB_RE, webpage)
3292 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3294 video_thumbnail = result.group(1)
3300 'upload_date': None,
3301 'title': video_title,
3303 'thumbnail': video_thumbnail,
3304 'description': None,
# Extractor for plus.google.com posts. Two steps: (1) scrape the post page
# for date/uploader/title, (2) follow the photos page referenced by the
# post and collect all redirector.googlevideo.com links, keeping the
# highest-resolution one.
# NOTE(review): gapped listing — guards, try:/return lines and the final
# info-dict braces are absent (numbering holes).
3308 class GooglePlusIE(InfoExtractor):
3309 """Information extractor for plus.google.com."""
3311 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3312 IE_NAME = u'plus.google'
3314 def __init__(self, downloader=None):
3315 InfoExtractor.__init__(self, downloader)
3317 def report_extract_entry(self, url):
3318 """Report downloading extry"""
3319 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3321 def report_date(self, upload_date):
3322 """Report downloading extry"""
3323 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3325 def report_uploader(self, uploader):
3326 """Report downloading extry"""
3327 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3329 def report_title(self, video_title):
3330 """Report downloading extry"""
3331 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3333 def report_extract_vid_page(self, video_page):
3334 """Report information extraction."""
3335 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3337 def _real_extract(self, url):
3338 # Extract id from URL
3339 mobj = re.match(self._VALID_URL, url)
3341 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3344 post_url = mobj.group(0)
3345 video_id = mobj.group(1)
3347 video_extension = 'flv'
3349 # Step 1, Retrieve post webpage to extract further information
3350 self.report_extract_entry(post_url)
3351 request = compat_urllib_request.Request(post_url)
3353 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3354 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3355 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3358 # Extract update date
3360 pattern = 'title="Timestamp">(.*?)</a>'
3361 mobj = re.search(pattern, webpage)
3363 upload_date = mobj.group(1)
3364 # Convert timestring to a format suitable for filename
3365 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3366 upload_date = upload_date.strftime('%Y%m%d')
3367 self.report_date(upload_date)
# extract uploader name from the rel="author" anchor
3371 pattern = r'rel\="author".*?>(.*?)</a>'
3372 mobj = re.search(pattern, webpage)
3374 uploader = mobj.group(1)
3375 self.report_uploader(uploader)
3378 # Get the first line for title
3380 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3381 mobj = re.search(pattern, webpage)
3383 video_title = mobj.group(1)
3384 self.report_title(video_title)
3386 # Step 2, Stimulate clicking the image box to launch video
3387 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3388 mobj = re.search(pattern, webpage)
3390 self._downloader.trouble(u'ERROR: unable to extract video page URL')
3392 video_page = mobj.group(1)
3393 request = compat_urllib_request.Request(video_page)
3395 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3396 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3397 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3399 self.report_extract_vid_page(video_page)
3402 # Extract video links on video page
3403 """Extract video links of all sizes"""
3404 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3405 mobj = re.findall(pattern, webpage)
3407 self._downloader.trouble(u'ERROR: unable to extract video links')
3409 # Sort in resolution
3410 links = sorted(mobj)
3412 # Choose the lowest of the sort, i.e. highest resolution
3413 video_url = links[-1]
3414 # Only get the url. The resolution part in the tuple has no use anymore
3415 video_url = video_url[-1]
3416 # Treat escaped \u0026 style hex
3418 video_url = video_url.decode("unicode_escape")
3419 except AttributeError: # Python 3
3420 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3426 'uploader': uploader,
3427 'upload_date': upload_date,
3428 'title': video_title,
3429 'ext': video_extension,
# Extractor for nba.com video pages: the CDN URL is built directly from the
# URL path; title/date/description are scraped from meta tags via the local
# _findProp helper (returns unescaped group 1, or a default on no match).
# NOTE(review): gapped listing — guards, else-branches and the info-dict
# braces are absent (numbering holes). 'uploader_date' on line 3463 looks
# like a typo for 'upload_date' — confirm against the downloader's keys.
3433 class NBAIE(InfoExtractor):
3436 def _real_extract(self, url):
3437 mobj = re.match(self._VALID_URL, url)
3439 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3442 video_id = mobj.group(1)
3443 if video_id.endswith('/index.html'):
3444 video_id = video_id[:-len('/index.html')]
3446 webpage = self._download_webpage(url, video_id)
3448 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
3449 def _findProp(rexp, default=None):
3450 m = re.search(rexp, webpage)
3452 return unescapeHTML(m.group(1))
3456 shortened_video_id = video_id.rpartition('/')[2]
3457 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3459 'id': shortened_video_id,
3463 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3464 'description': _findProp(r'<div class="description">(.*?)</h1>'),
# Extractor for justin.tv / twitch.tv. _parse_page fetches one page of the
# JSON API and returns (item count, list of valid info dicts); _real_extract
# chooses the channel-archives or single-broadcast endpoint and pages
# through results _JUSTIN_PAGE_LIMIT at a time.
# NOTE(review): gapped listing — guards, try:/return lines and dict braces
# are absent (numbering holes).
3468 class JustinTVIE(InfoExtractor):
3469 """Information extractor for justin.tv and twitch.tv"""
3470 # TODO: One broadcast may be split into multiple videos. The key
3471 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3472 # starts at 1 and increases. Can we treat all parts as one video?
3474 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3475 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3476 _JUSTIN_PAGE_LIMIT = 100
3477 IE_NAME = u'justin.tv'
3479 def report_extraction(self, file_id):
3480 """Report information extraction."""
3481 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3483 def report_download_page(self, channel, offset):
3484 """Report attempt to download a single page of videos."""
3485 self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
3486 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3488 # Return count of items, list of *valid* items
3489 def _parse_page(self, url):
3491 urlh = compat_urllib_request.urlopen(url)
3492 webpage_bytes = urlh.read()
3493 webpage = webpage_bytes.decode('utf-8', 'ignore')
3494 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3495 self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
# a non-list response is the API's error envelope
3498 response = json.loads(webpage)
3499 if type(response) != list:
3500 error_text = response.get('error', 'unknown error')
3501 self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
3504 for clip in response:
3505 video_url = clip['video_file_url']
3507 video_extension = os.path.splitext(video_url)[1][1:]
# start_time 'YYYY-MM-DD…' -> 'YYYYMMDD'
3508 video_date = re.sub('-', '', clip['start_time'][:10])
3509 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3510 video_id = clip['id']
3511 video_title = clip.get('title', video_id)
3515 'title': video_title,
3516 'uploader': clip.get('channel_name', video_uploader_id),
3517 'uploader_id': video_uploader_id,
3518 'upload_date': video_date,
3519 'ext': video_extension,
3521 return (len(response), info)
3523 def _real_extract(self, url):
3524 mobj = re.match(self._VALID_URL, url)
3526 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# lastindex == 1 means only the channel group matched (no /b/<id> part)
3529 api = 'http://api.justin.tv'
3530 video_id = mobj.group(mobj.lastindex)
3532 if mobj.lastindex == 1:
3534 api += '/channel/archives/%s.json'
3536 api += '/broadcast/by_archive/%s.json'
3537 api = api % (video_id,)
3539 self.report_extraction(video_id)
# page through the API; a short page (count != limit) means we are done
3543 limit = self._JUSTIN_PAGE_LIMIT
3546 self.report_download_page(video_id, offset)
3547 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3548 page_count, page_info = self._parse_page(page_url)
3549 info.extend(page_info)
3550 if not paged or page_count != limit:
# Extractor for funnyordie.com: video URL from the second <source> element,
# title from the player_page_h1 anchor, description from og:description.
# NOTE(review): gapped listing — guards and the info-dict braces are absent
# (numbering holes).
3555 class FunnyOrDieIE(InfoExtractor):
3556 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3558 def _real_extract(self, url):
3559 mobj = re.match(self._VALID_URL, url)
3561 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3564 video_id = mobj.group('id')
3565 webpage = self._download_webpage(url, video_id)
3567 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3569 self._downloader.trouble(u'ERROR: unable to find video information')
3570 video_url = unescapeHTML(m.group('url'))
3572 m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
3574 self._downloader.trouble(u'Cannot find video title')
3575 title = unescapeHTML(m.group('title'))
3577 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3579 desc = unescapeHTML(m.group('desc'))
3588 'description': desc,
# Extractor for tweetreel.com: scrapes status id, tweet text (description),
# uploader and unix timestamp from the page, then builds the .mov URL from
# the status id.
# NOTE(review): gapped listing — "if m is None:" guards and the info-dict
# braces are absent (numbering holes).
3592 class TweetReelIE(InfoExtractor):
3593 _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'
3595 def _real_extract(self, url):
3596 mobj = re.match(self._VALID_URL, url)
3598 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3601 video_id = mobj.group('id')
3602 webpage = self._download_webpage(url, video_id)
3604 m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
3606 self._downloader.trouble(u'ERROR: Cannot find status ID')
3607 status_id = m.group(1)
3609 m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
3611 self._downloader.trouble(u'WARNING: Cannot find description')
# strip anchor tags from the tweet text before unescaping
3612 desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()
3614 m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
3616 self._downloader.trouble(u'ERROR: Cannot find uploader')
3617 uploader = unescapeHTML(m.group('uploader'))
3618 uploader_id = unescapeHTML(m.group('uploader_id'))
3620 m = re.search(r'<span unixtime="([0-9]+)"', webpage)
3622 self._downloader.trouble(u'ERROR: Cannot find upload date')
# unix timestamp -> 'YYYYMMDD' filename-friendly date
3623 upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')
3626 video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'
3633 'description': desc,
3634 'uploader': uploader,
3635 'uploader_id': uploader_id,
3636 'internal_id': status_id,
3637 'upload_date': upload_date
# Extractor for store.steampowered.com: loads the game's /video/ page and
# pairs each 'movie_<id>' player entry with its <span class="title"> via
# zip, yielding one info dict per trailer. suitable() is overridden because
# _VALID_URL needs re.VERBOSE.
# NOTE(review): gapped listing — the gameID group of _VALID_URL, guards and
# dict/list framing are absent (numbering holes).
3641 class SteamIE(InfoExtractor):
3642 _VALID_URL = r"""http://store.steampowered.com/
3643 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3645 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3648 def suitable(self, url):
3649 """Receives a URL and returns True if suitable for this IE."""
3650 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
3652 def _real_extract(self, url):
3653 m = re.match(self._VALID_URL, url, re.VERBOSE)
3654 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3655 gameID = m.group('gameID')
3656 videourl = 'http://store.steampowered.com/video/%s/' % gameID
3657 webpage = self._download_webpage(videourl, gameID)
3658 mweb = re.finditer(urlRE, webpage)
3659 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3660 titles = re.finditer(namesRE, webpage)
3662 for vid,vtitle in zip(mweb,titles):
3663 video_id = vid.group('videoID')
3664 title = vtitle.group('videoName')
3665 video_url = vid.group('videoURL')
3667 self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
3672 'title': unescapeHTML(title)
# Extractor for ustream.tv recorded videos: the video URL is derived from
# the numeric id; title and uploader id are scraped from data- attributes.
# NOTE(review): gapped listing — the returned info-dict braces and several
# fields are absent (numbering holes). No None-checks are visible on the
# re.search results here.
3677 class UstreamIE(InfoExtractor):
3678 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3679 IE_NAME = u'ustream'
3681 def _real_extract(self, url):
3682 m = re.match(self._VALID_URL, url)
3683 video_id = m.group('videoID')
3684 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3685 webpage = self._download_webpage(url, video_id)
3686 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3687 title = m.group('title')
3688 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3689 uploader = m.group('uploader')
3695 'uploader': uploader
# Extractor for rbmaradio.com shows: pulls the gon.show JSON blob embedded
# in a <script> tag, appends a 256kbps cbr parameter to the akamai URL and
# maps the JSON fields onto the info dict (with .get() for optional ones).
# NOTE(review): gapped listing — the info-dict braces and some fields are
# absent (numbering holes).
3699 class RBMARadioIE(InfoExtractor):
3700 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3702 def _real_extract(self, url):
3703 m = re.match(self._VALID_URL, url)
3704 video_id = m.group('videoID')
3706 webpage = self._download_webpage(url, video_id)
3707 m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3709 raise ExtractorError(u'Cannot find metadata')
3710 json_data = m.group(1)
3713 data = json.loads(json_data)
3714 except ValueError as e:
3715 raise ExtractorError(u'Invalid JSON: ' + str(e))
3717 video_url = data['akamai_url'] + '&cbr=256'
# extension taken from the URL path, after the last '.'
3718 url_parts = compat_urllib_parse_urlparse(video_url)
3719 video_ext = url_parts.path.rpartition('.')[2]
3724 'title': data['title'],
3725 'description': data.get('teaser_text'),
3726 'location': data.get('country_of_origin'),
3727 'uploader': data.get('host', {}).get('name'),
3728 'uploader_id': data.get('host', {}).get('slug'),
3729 'thumbnail': data.get('image', {}).get('large_url_2x'),
3730 'duration': data.get('duration'),
# Extractor for youporn.com: sets an age_verified cookie, scrapes title /
# date / uploader, collects every download link from the downloadList,
# derives each link's size+bitrate "format" from its URL path, then honors
# --list-formats / best / worst / all / a specific requested format.
# NOTE(review): gapped listing — guards, return statements, the per-format
# dict braces and the sorting of 'formats' are absent (numbering holes).
3736 class YouPornIE(InfoExtractor):
3736 """Information extractor for youporn.com."""
3737 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3739 def _print_formats(self, formats):
3740 """Print all available formats"""
3741 print(u'Available formats:')
3742 print(u'ext\t\tformat')
3743 print(u'---------------------------------')
3744 for format in formats:
3745 print(u'%s\t\t%s' % (format['ext'], format['format']))
3747 def _specific(self, req_format, formats):
# linear scan for the entry whose 'format' equals the request
3749 if(x["format"]==req_format):
3753 def _real_extract(self, url):
3754 mobj = re.match(self._VALID_URL, url)
3756 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3759 video_id = mobj.group('videoid')
# pretend age verification already happened
3761 req = compat_urllib_request.Request(url)
3762 req.add_header('Cookie', 'age_verified=1')
3763 webpage = self._download_webpage(req, video_id)
3765 # Get the video title
3766 result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3768 raise ExtractorError(u'Unable to extract video title')
3769 video_title = result.group('title').strip()
3771 # Get the video date
3772 result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
3774 self._downloader.to_stderr(u'WARNING: unable to extract video date')
3777 upload_date = result.group('date').strip()
3779 # Get the video uploader
3780 result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3782 self._downloader.to_stderr(u'WARNING: unable to extract uploader')
3783 video_uploader = None
3785 video_uploader = result.group('uploader').strip()
3786 video_uploader = clean_html( video_uploader )
3788 # Get all of the formats available
3789 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3790 result = re.search(DOWNLOAD_LIST_RE, webpage)
3792 raise ExtractorError(u'Unable to extract download list')
3793 download_list_html = result.group('download_list').strip()
3795 # Get all of the links from the page
3796 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3797 links = re.findall(LINK_RE, download_list_html)
3798 if(len(links) == 0):
3799 raise ExtractorError(u'ERROR: no known formats available for video')
3801 self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))
3806 # A link looks like this:
3807 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3808 # A path looks like this:
3809 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3810 video_url = unescapeHTML( link )
3811 path = compat_urllib_parse_urlparse( video_url ).path
3812 extension = os.path.splitext( path )[1][1:]
# e.g. ['480p', '370k'] from the '480p_370k_<id>' path component
3813 format = path.split('/')[4].split('_')[:2]
3816 format = "-".join( format )
3817 title = u'%s-%s-%s' % (video_title, size, bitrate)
3822 'uploader': video_uploader,
3823 'upload_date': upload_date,
3828 'description': None,
3832 if self._downloader.params.get('listformats', None):
3833 self._print_formats(formats)
3836 req_format = self._downloader.params.get('format', None)
3837 self._downloader.to_screen(u'[youporn] Format: %s' % req_format)
3839 if req_format is None or req_format == 'best':
3841 elif req_format == 'worst':
3842 return [formats[-1]]
3843 elif req_format in ('-1', 'all'):
3846 format = self._specific( req_format, formats )
3848 self._downloader.trouble(u'ERROR: requested format not available')
# Extractor for pornotube.com: title comes from the URL itself; the flv URL
# and the "Added <date> by" string are scraped from the page.
# NOTE(review): gapped listing — "if result is None:" guards and parts of
# the info dict are absent (numbering holes). The error message on line
# 3882 says 'title' while the regex extracts the upload date.
3854 class PornotubeIE(InfoExtractor):
3855 """Information extractor for pornotube.com."""
3856 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3858 def _real_extract(self, url):
3859 mobj = re.match(self._VALID_URL, url)
3861 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3864 video_id = mobj.group('videoid')
3865 video_title = mobj.group('title')
3867 # Get webpage content
3868 webpage = self._download_webpage(url, video_id)
3871 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3872 result = re.search(VIDEO_URL_RE, webpage)
3874 self._downloader.trouble(u'ERROR: unable to extract video url')
3876 video_url = compat_urllib_parse.unquote(result.group('url'))
3878 #Get the uploaded date
3879 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3880 result = re.search(VIDEO_UPLOADED_RE, webpage)
3882 self._downloader.trouble(u'ERROR: unable to extract video title')
3884 upload_date = result.group('date')
3886 info = {'id': video_id,
3889 'upload_date': upload_date,
3890 'title': video_title,
# Extractor for youjizz.com: reads the <title> from the watch page, follows
# the embed page referenced in it, and pulls the real flv URL out of the
# embed page's so.addVariable("file", ...) call.
# NOTE(review): gapped listing — "if result is None:" guards and parts of
# the info dict are absent (numbering holes).
3896 class YouJizzIE(InfoExtractor):
3897 """Information extractor for youjizz.com."""
3898 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3900 def _real_extract(self, url):
3901 mobj = re.match(self._VALID_URL, url)
3903 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3906 video_id = mobj.group('videoid')
3908 # Get webpage content
3909 webpage = self._download_webpage(url, video_id)
3911 # Get the video title
3912 result = re.search(r'<title>(?P<title>.*)</title>', webpage)
3914 raise ExtractorError(u'ERROR: unable to extract video title')
3915 video_title = result.group('title').strip()
3917 # Get the embed page
3918 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3920 raise ExtractorError(u'ERROR: unable to extract embed page')
# note: video_id is re-assigned to the embed page's numeric id here
3922 embed_page_url = result.group(0).strip()
3923 video_id = result.group('videoid')
3925 webpage = self._download_webpage(embed_page_url, video_id)
3928 result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
3930 raise ExtractorError(u'ERROR: unable to extract video url')
3931 video_url = result.group('source')
3933 info = {'id': video_id,
3935 'title': video_title,
3938 'player_url': embed_page_url}
# Extractor for 8tracks.com mixes: reads the PAGE.mix JSON from the page,
# then walks the play/next API with a random session id, emitting one track
# per iteration until the API reports at_last_track.
# NOTE(review): gapped listing — the mix_id assignment, track-dict braces
# and the loop's result collection are absent (numbering holes).
3942 class EightTracksIE(InfoExtractor):
3944 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3946 def _real_extract(self, url):
3947 mobj = re.match(self._VALID_URL, url)
3949 raise ExtractorError(u'Invalid URL: %s' % url)
3950 playlist_id = mobj.group('id')
3952 webpage = self._download_webpage(url, playlist_id)
3954 m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
3956 raise ExtractorError(u'Cannot find trax information')
3957 json_like = m.group(1)
3958 data = json.loads(json_like)
# random session id for the play API (stringified for URL building)
3960 session = str(random.randint(0, 1000000000))
3962 track_count = data['tracks_count']
3963 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3964 next_url = first_url
3966 for i in itertools.count():
3967 api_json = self._download_webpage(next_url, playlist_id,
3968 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3969 errnote=u'Failed to download song information')
3970 api_data = json.loads(api_json)
3971 track_data = api_data[u'set']['track']
3973 'id': track_data['id'],
3974 'url': track_data['track_file_stream_url'],
3975 'title': track_data['performer'] + u' - ' + track_data['name'],
3976 'raw_title': track_data['name'],
3977 'uploader_id': data['user']['login'],
3981 if api_data['set']['at_last_track']:
3983 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# Extractor for keek.com: video and thumbnail URLs are built directly from
# the id; title and uploader are scraped from og:title and the bio block.
# NOTE(review): gapped listing — the info-dict braces and some fields are
# absent (numbering holes). No None-checks are visible on the re.search
# results here.
3986 class KeekIE(InfoExtractor):
3987 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
3990 def _real_extract(self, url):
3991 m = re.match(self._VALID_URL, url)
3992 video_id = m.group('videoID')
3993 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3994 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3995 webpage = self._download_webpage(url, video_id)
3996 m = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
3997 title = unescapeHTML(m.group('title'))
3998 m = re.search(r'<div class="bio-names-and-report">[\s\n]+<h4>(?P<uploader>\w+)</h4>', webpage)
3999 uploader = unescapeHTML(m.group('uploader'))
4005 'thumbnail': thumbnail,
4006 'uploader': uploader
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    # The verbose regex distinguishes a playlist URL from a single-talk URL
    # via the mutually exclusive named groups type_playlist / type_talk.
    _VALID_URL = r'''http://www\.ted\.com/
                     (
                          ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                          |
                          ((?P<type_talk>talks)) # We have a simple talk
                     )
                     /(?P<name>\w+) # Here goes the name and then ".html"
                     '''

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL needs the re.VERBOSE flag, which the
        # base-class implementation does not pass.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Dispatch to single-talk or playlist extraction based on the URL."""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else:
            playlist_id = m.group('playlist_id')
            name = m.group('name')
            self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME, playlist_id, name))
            return self._playlist_videos_info(url, name, playlist_id)

    def _talk_video_link(self, mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self, url, name, playlist_id=0):
        '''Returns the videos of the playlist'''
        # Each talk <li> carries its numeric id and media slug as data-attrs;
        # titles are scraped separately and zipped with the talk matches.
        video_RE = r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE = r'<p\ class="talk-title"><a href="/talks/(.+).html">(?P<fullname>.+?)</a></p>'
        webpage = self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos = re.finditer(video_RE, webpage, re.VERBOSE)
        m_names = re.finditer(video_name_RE, webpage)
        info = []
        for m_video, m_name in zip(m_videos, m_names):
            video_dic = {
                'id': m_video.group('video_id'),
                'url': self._talk_video_link(m_video.group('mediaSlug')),
                'ext': 'mp4',
                'title': m_name.group('fullname'),
            }
            info.append(video_dic)
        return info

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        videoName = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE = r'<h1><span id="altHeadline" >(?P<title>[\s\w:/\.\?=\+-\\\']*)</span></h1>'
        title = re.search(title_RE, webpage).group('title')
        # The talkDetails inline script embeds the numeric id and media slug
        # needed to build the direct download URL.
        info_RE = r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        info_match = re.search(info_RE, webpage, re.VERBOSE)
        video_id = info_match.group('videoID')
        mediaSlug = info_match.group('mediaSlug')
        video_url = self._talk_video_link(mediaSlug)
        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
        }
        return info
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de videos (via its XML metadata API)."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        """Fetch the metadata XML for the video id embedded in the URL path.

        Returns a single-element list of info dictionaries; reports an
        error via the downloader and returns None when mandatory fields
        (download URL, title) are missing from the metadata.
        """
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.trouble(u'ERROR: unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        title = title_el.text
        # Renamed from `format` to avoid shadowing the builtin; the info-dict
        # key stays 'format'. Per the InfoExtractor contract, 'format'
        # defaults to the extension when the metadata omits it.
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            video_format = extension
        else:
            video_format = format_id_el.text
        # description and thumbnail are optional fields
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None

        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': video_format,
            'thumbnail': thumbnail,
            'description': description,
        }
        return [info]
4138 def gen_extractors():
4139 """ Return a list of an instance of every supported extractor.
4140 The order does matter; the first extractor matched is the one handling the URL.
4143 YoutubePlaylistIE(),
4167 StanfordOpenClassroomIE(),