2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
13 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    uploader:       Nickname of the video uploader, unescaped.
    upload_date:    Video upload date (YYYYMMDD).
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # NOTE(review): an internal readiness flag is presumably initialised
        # alongside this call — confirm before relying on attribute presence.
        self.set_downloader(downloader)
72 def suitable(self, url):
73 """Receives a URL and returns True if suitable for this IE."""
74 return re.match(self._VALID_URL, url) is not None
        """Getter method for _WORKING."""

        """Initializes an instance (authentication, etc)."""
        # Delegate to the subclass hook; the real work (login, age check,
        # language selection) happens in _real_initialize.
        self._real_initialize()
    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # Template method: subclass-specific extraction lives in _real_extract.
        # NOTE(review): initialization appears to precede this call — confirm.
        return self._real_extract(url)
91 def set_downloader(self, downloader):
92 """Sets the downloader for this IE."""
93 self._downloader = downloader
    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Body of the verbose _VALID_URL pattern; the video id is captured by the
    # `([0-9A-Za-z_-]+)` group near the end, and everything before it is
    # optional so a bare id also matches. (The `#` texts inside the pattern
    # are re.VERBOSE comments, part of the original string.)
                     (?:https?://)?                                       # http(s):// (optional)
                     (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                        tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                     (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                     (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
                     (?:                                                  # the various things that can precede the ID:
                         (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                         |(?:                                             # or the v= param in all its forms
                             (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                             (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                             (?:.+&)?                                     # any other preceding param (like /?s=tuff&v=xxxx)
                         )?                                               # optional -> youtube.com/xxxx is OK
                     )?                                                   # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                     # here is it! the YouTube video ID
                     (?(1).+)?                                            # if we found the ID, everything can follow
    # Forces the English interface so scraped strings are predictable.
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Pulls the original target out of age-verification style redirect URLs.
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # itag -> human-readable dimensions, used by _print_formats
    _video_dimensions = {
165 def suitable(self, url):
166 """Receives a URL and returns True if suitable for this IE."""
167 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
169 def report_lang(self):
170 """Report attempt to set language."""
171 self._downloader.to_screen(u'[youtube] Setting language')
173 def report_login(self):
174 """Report attempt to log in."""
175 self._downloader.to_screen(u'[youtube] Logging in')
177 def report_age_confirmation(self):
178 """Report attempt to confirm age."""
179 self._downloader.to_screen(u'[youtube] Confirming age')
181 def report_video_webpage_download(self, video_id):
182 """Report attempt to download video webpage."""
183 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
185 def report_video_info_webpage_download(self, video_id):
186 """Report attempt to download video info webpage."""
187 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
193 def report_information_extraction(self, video_id):
194 """Report attempt to extract video information."""
195 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
    def report_unavailable_format(self, video_id, format):
        """Report that a requested format is not available for the video."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
201 def report_rtmp_download(self):
202 """Indicate the download will use the RTMP protocol."""
203 self._downloader.to_screen(u'[youtube] RTMP download detected')
    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube timedtext XML into SRT-formatted subtitle text."""
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # default duration when the tag has none
            # NOTE(review): `start` must already be numeric for this addition
            # and the %-formatting below — confirm the conversion happens
            # before this point.
            end = start + float(dur)
            # hh:mm:ss,mmm timestamps, as required by the SRT format
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            # NOTE(review): `srt` is a string accumulator — confirm it is
            # initialised to '' before the loop.
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
    def _extract_subtitles(self, video_id):
        """Fetch closed captions for *video_id*.

        Returns a (warning, srt_contents) pair: on any failure the warning is
        a unicode message and the contents are None; on success the warning
        is None.
        """
        self.report_video_subtitles_download(video_id)
        # First list the available caption tracks.
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
            srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        # Map language code -> track name.
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        # Language choice: user preference first, English next, else fallback.
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            # NOTE(review): indexing dict.keys() is Python-2-only; on Python 3
            # this raises TypeError — confirm the intended runtime.
            srt_lang = srt_lang_list.keys()[0]
        if not srt_lang in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        # Download the chosen caption track.
        request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
            srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
            return (u'WARNING: unable to download video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))
    def _print_formats(self, formats):
        """Print a table of format codes with container extension and size."""
        print('Available formats:')
        # NOTE(review): `x` iterates over `formats` — the loop header sits
        # just above this line; confirm.
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
    def _real_initialize(self):
        """Set the interface language, optionally log in, and confirm age."""
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # best-effort: warn and continue unauthenticated
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))

        # Force English so later screen-scraping regexps match reliably.
        request = compat_urllib_request.Request(self._LANG_URL)
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))

        # No authentication to be performed
            'current_form': 'loginForm',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # A login form in the response means the credentials were rejected.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))

        # Confirm age by POSTing the verification form.
            'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
    def _real_extract(self, url):
        """Download the watch page and video info, choose formats, and build
        one info dictionary per selected format."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')

        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(2)

        # Download the watch page (has_verified=1 skips some interstitials).
        self.report_video_webpage_download(video_id)
        request = compat_urllib_request.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
            # un-escape the JSON-style \/ sequences
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Try get_video_info with several &el= variants until one yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))

        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader nickname
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: scraped off the page, then normalised to YYYYMMDD
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # description (optional)
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
            video_description = ''

        # closed captions (optional, on --write-srt)
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
                self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
        video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            # RTMP stream: a single pseudo-format with no itag.
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            # itag -> signed download URL for every advertised stream
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific formats. We pick the first in a slash-delimited sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                        video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

        # One info dictionary per selected format.
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

                'url':          video_real_url,
                'uploader':     video_uploader,
                'upload_date':  upload_date,
                'title':        video_title,
                'ext':          video_extension,
                'format':       video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # group(1) is the video id, group(2) the simplified title slug.
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    # Endpoint that records the family-filter choice made in _real_initialize.
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'
519 def __init__(self, downloader=None):
520 InfoExtractor.__init__(self, downloader)
522 def report_disclaimer(self):
523 """Report disclaimer retrieval."""
524 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
526 def report_age_confirmation(self):
527 """Report attempt to confirm age."""
528 self._downloader.to_screen(u'[metacafe] Confirming age')
530 def report_download_webpage(self, video_id):
531 """Report webpage download."""
532 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
534 def report_extraction(self, video_id):
535 """Report information extraction."""
536 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
    def _real_initialize(self):
        """Fetch the disclaimer page and POST the family-filter confirmation."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age by submitting the filter form.
            'submit': "Continue - I'm over 18",
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
    def _real_extract(self, url):
        """Extract the media URL, title and uploader from a metacafe page,
        delegating yt-prefixed ids to the YouTube extractor."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Hand the embedded YouTube id to the downloader's normal flow.
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
            # Fallback: dig the media URL out of the flashvars blob.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        # NOTE(review): the .decode('utf-8') calls below assume Python 2 byte
        # strings; on Python 3 str has no decode — confirm the target runtime.
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # Matches any dailymotion TLD; group(1) is the raw id+slug segment.
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
644 def __init__(self, downloader=None):
645 InfoExtractor.__init__(self, downloader)
647 def report_download_webpage(self, video_id):
648 """Report webpage download."""
649 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
651 def report_extraction(self, video_id):
652 """Report information extraction."""
653 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
    def _real_extract(self, url):
        """Pick the best available quality from the flashvars blob and return
        the video's URL and metadata."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # strip the title slug and query string off the id
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Opt out of the family filter so restricted pages are served.
        request.add_header('Cookie', 'family_filter=off')
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Probe qualities from best to worst; keep the first key present.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
            self._downloader.trouble(u'ERROR: unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
            self._downloader.trouble(u'ERROR: unable to extract video URL')

        # un-escape the JSON-style \/ sequences
        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
            # looking for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
                video_uploader = mobj_official.group(1)
            video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
            # DD-MM-YYYY on the page -> YYYYMMDD
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

            'uploader':     video_uploader,
            'upload_date':  video_upload_date,
            'title':        video_title,
            'ext':          video_extension,
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # group(1) is the .flv filename from the `current=` query parameter.
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'
742 def __init__(self, downloader=None):
743 InfoExtractor.__init__(self, downloader)
745 def report_download_webpage(self, video_id):
746 """Report webpage download."""
747 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
749 def report_extraction(self, video_id):
750 """Report information extraction."""
751 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
    def _real_extract(self, url):
        """Extract the flv URL, title and uploader from a Photobucket page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        # title and uploader share one <title> regexp
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        # NOTE(review): .decode('utf-8') on these values assumes Python 2 byte
        # strings — confirm the target runtime.
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'
811 def __init__(self, downloader=None):
812 InfoExtractor.__init__(self, downloader)
814 def report_download_webpage(self, video_id):
815 """Report webpage download."""
816 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
818 def report_extraction(self, video_id):
819 """Report information extraction."""
820 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
    def _real_extract(self, url, new_video=True):
        """Extract metadata and the playlist-derived media URL for a Yahoo!
        Video page, first rewriting non-/watch/ URLs into extractable ones."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
                self._downloader.trouble(u'ERROR: Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            # Recurse once with the canonical /watch/ URL.
            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video title')
        # NOTE(review): .decode('utf-8') throughout assumes Python 2 byte
        # strings — confirm the target runtime.
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
        # NOTE(review): group(1) of this pattern captures 'people|profile',
        # while the uploader name is group(2) — this looks like a wrong group
        # index; confirm against a live page before changing.
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

            'id':           video_id.decode('utf-8'),
            'uploader':     video_uploader,
            'title':        video_title,
            'ext':          video_extension.decode('utf-8'),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com.

    Parses the config JSON embedded in the video page, picks the best
    available codec/quality pair and builds the play_redirect URL.
    """

    # _VALID_URL matches Vimeo URLs; group(1) captures the numeric video id.
    _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract the information dictionary for a vimeo.com URL."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard (and its `return`) appear
        # lost in this copy -- trouble() should only run when nothing matched.
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        # NOTE(review): the `try:` opening this network block is missing here.
        self.report_download_webpage(video_id)
        webpage_bytes = compat_urllib_request.urlopen(request).read()
        webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON (text between ' = {config:' and ',assets:')
        config = webpage.split(' = {config:')[1].split(',assets:')[0]
        config = json.loads(config)
        # NOTE(review): the try/except wrapping the two lines above is missing
        # in this copy; trouble() should only run on a parse failure.
        self._downloader.trouble(u'ERROR: unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader
        video_uploader = config["video"]["owner"]["name"]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                # NOTE(review): an `else:` line appears to be missing here.
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the first hit in hd -> sd -> other preference order.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
        # NOTE(review): the loop `break` and the `else:` before the error
        # below appear truncated in this copy.
        self._downloader.trouble(u'ERROR: no known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
            %(video_id, sig, timestamp, video_quality, video_codec.upper())

        # Result dictionary entries (the surrounding `return [{ ... }]` and
        # the 'id'/'url' entries are truncated in this copy).
        'uploader': video_uploader,
        'upload_date': video_upload_date,
        'title': video_title,
        'ext': video_extension,
        'thumbnail': video_thumbnail,
        'description': video_description,
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor.

    Handles both live-stream pages (matched by _LIVE_URL) and "plus7"
    video pages; all scraping goes through grep_webpage().
    """

    # Pages under videos.arte.tv in French or German
    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live pages end in index-<n>.html
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download *url* and return the raw page contents."""
        self._downloader.increment_downloads()
        request = compat_urllib_request.Request(url)
        # NOTE(review): the `try:` opening this block is missing in this copy.
        self.report_download_webpage(url)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
        # NOTE(review): `return webpage` appears to be truncated in this copy.

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex*, and collect the groups described by
        *matchTuples* -- (group index, key, error message) -- into a dict."""
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        # NOTE(review): `info = {}` and the `if mobj is None:` guard are
        # missing in this copy.
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                # report the per-group error message supplied by the caller
                self._downloader.trouble(err)
            # presumably an `else:` wrapped the assignment below -- TODO confirm
            info[key] = mobj.group(i)
        # NOTE(review): `return info` appears to be truncated in this copy.

    def extractLiveStream(self, url):
        """Extract the stream location for a live page."""
        # language ('fr'/'de') is the 4th path component from the end
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            # NOTE(review): the url/flags/list arguments of this call are
            # partially truncated in this copy.
            r'src="(.*?/videothek_js.*?\.js)',
            (1, 'url', u'ERROR: Invalid URL: %s' % url)
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            # the JS config names the SWF player and the stream path
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
            (1, 'path', u'ERROR: could not extract video path: %s' % url),
            (2, 'player', u'ERROR: could not extract video player: %s' % url),
            (3, 'url', u'ERROR: could not extract video url: %s' % url)
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Extract id/title/date/url for a "plus7" video page."""
        # language ('fr'/'de') is the 3rd path component from the end
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            (1, 'url', u'ERROR: Invalid URL: %s' % url)
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            # final XML document carries id, name, date and the hd URL
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            (1, 'id', u'ERROR: could not extract video id: %s' % url),
            (2, 'title', u'ERROR: could not extract video title: %s' % url),
            (3, 'date', u'ERROR: could not extract video date: %s' % url),
            (4, 'url', u'ERROR: could not extract video url: %s' % url)
        # Result dictionary entries (the surrounding `return { ... }` is
        # truncated in this copy).
        'id': info.get('id'),
        'url': compat_urllib_parse.unquote(info.get('url')),
        'uploader': u'arte.tv',
        'upload_date': info.get('date'),
        'title': info.get('title').decode('utf-8'),

    def _real_extract(self, url):
        """Dispatch to the live-stream or plus7 extraction path."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)
        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
        # presumably an `else:` branch wrapped the line below -- TODO confirm
        info = self.extractPlus7Stream(url)
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    Follows URL-shortener redirects, then scans the page for an embedded
    JW Player / SWFObject video URL.
    """
    # NOTE(review): the _VALID_URL attribute is missing from this copy.

    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        # Warn first: falling back on the generic extractor is best-effort.
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            # Issue HEAD instead of GET so only headers are fetched.
            def get_method(self):
                # NOTE(review): `return "HEAD"` is truncated in this copy.

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # A HEAD request has no body: drop body-related headers.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       # NOTE(review): the headers=/unverifiable=
                                       # keyword arguments are truncated here.
                                       origin_req_host=req.get_origin_req_host(),
                # presumably an `else:` wrapped the raise -- TODO confirm
                raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                # Retry the same URL with a plain (GET) request.
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                        # NOTE(review): trailing keyword
                                        # arguments are truncated here.
                                        origin_req_host=req.get_origin_req_host(),

        # Build a bare opener with exactly the handlers needed for the probe.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()
        # NOTE(review): the `if url == new_url: return False` early exit
        # appears truncated in this copy.
        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        # NOTE(review): `return True` appears truncated in this copy.

    def _real_extract(self, url):
        """Extract a directly-embedded video URL from an arbitrary page."""
        # Restart the extraction chain on redirecting URLs.
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        # NOTE(review): the `try:` opening this block is missing in this copy.
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        # NOTE(review): the `if mobj is None:` guard before the broadened
        # search is missing in this copy.
        # Broaden the search a little bit
        mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        # NOTE(review): the `if mobj is None:` guard/return is missing here.
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        # NOTE(review): guard missing here too; also the message says
        # "title" although this step extracts the uploader -- TODO confirm.
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_uploader = mobj.group(1)

        # Result dictionary entries (the surrounding `return [{ ... }]` and
        # the 'id'/'url' entries are truncated in this copy).
        'uploader': video_uploader,
        'upload_date': None,
        'title': video_title,
        'ext': video_extension,
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""

    # "ytsearch<N>:term", "ytsearchall:term" or plain "ytsearch:term"
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    # GData API endpoint; page size is fixed at 50 results per request.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearch prefix and dispatch to _download_n_results."""
        mobj = re.match(self._VALID_URL, query)
        # NOTE(review): the `if mobj is None:` guard/return is missing here.
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # empty prefix: download a single result
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
        # NOTE(review): the `else:`/`try: n = int(prefix)` branch and the
        # `if n <= 0:` guard are missing in this copy.
        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_youtube_results:
            # clamp oversized requests to the API maximum
            self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
            n = self._max_youtube_results
        self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # NOTE(review): initialisation (`video_ids = []`, `pagenum = 0`,
        # `limit = n`) appears truncated in this copy.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            # start-index is 1-based in the GData API
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            # NOTE(review): a `try:` should wrap the next line.
            data = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # never ask for more results than the API reports available
            limit = min(n, api_response['totalItems'])
            # NOTE(review): the `pagenum` increment appears truncated here.

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""

    # "gvsearch<N>:term", "gvsearchall:term" or plain "gvsearch:term"
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    # matches links to individual video pages; group(1) is the docid
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    # presence of this marker means there are further result pages
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the gvsearch prefix and dispatch to _download_n_results."""
        mobj = re.match(self._VALID_URL, query)
        # NOTE(review): the `if mobj is None:` guard/return is missing here.
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # empty prefix: download a single result
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
        # NOTE(review): the numeric-prefix branch (`n = int(prefix)` and the
        # `if n <= 0:` guard) is missing in this copy.
        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_google_results:
            # clamp oversized requests to the documented maximum
            self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
            n = self._max_google_results
        self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # NOTE(review): initialisation (`video_ids = []`, `pagenum = 0`,
        # `while True:`) appears truncated in this copy.
        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
        request = compat_urllib_request.Request(result_url)
        # NOTE(review): a `try:` should wrap the next line.
        page = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in video_ids:
                video_ids.append(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                    # presumably a `return` followed here -- TODO confirm

        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            # no more result pages: download what was collected
            for id in video_ids:
                self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
            # presumably a `return` followed here -- TODO confirm

        pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    # "yvsearch<N>:term", "yvsearchall:term" or plain "yvsearch:term"
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    # group(1) is "<uploader-id>/<video-id>"
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearch prefix and dispatch to _download_n_results."""
        mobj = re.match(self._VALID_URL, query)
        # NOTE(review): the `if mobj is None:` guard/return is missing here.
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # empty prefix: download a single result
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
        # NOTE(review): the numeric-prefix branch (`n = int(prefix)` and the
        # `if n <= 0:` guard) is missing in this copy.
        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_yahoo_results:
            # clamp oversized requests to the documented maximum
            self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
            n = self._max_yahoo_results
        self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # NOTE(review): `video_ids = []`, `pagenum = 1` and the `while True:`
        # header appear truncated in this copy.
        already_seen = set()

        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
        request = compat_urllib_request.Request(result_url)
        # NOTE(review): a `try:` should wrap the next line.
        page = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in already_seen:
                video_ids.append(video_id)
                already_seen.add(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                    # presumably a `return` followed here -- TODO confirm

        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            # no more result pages: download what was collected
            for id in video_ids:
                self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
            # presumably a `return` followed here -- TODO confirm

        pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # group(1): prefix type (p/a/list), group(2): playlist id,
    # group(3): optional direct video id appended to the playlist path
    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s'
    # "Next »" link text marks the existence of further pages
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Collect every video id of the playlist and queue it for download."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard/return is missing here.
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        # A trailing video id means a single video: delegate its download.
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])
            # presumably a `return` followed here -- TODO confirm

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        # NOTE(review): the `else:` line is missing in this copy.
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)
        # NOTE(review): `video_ids = []`, `pagenum = 1` and the `while True:`
        # header appear truncated in this copy.
        self.report_download_page(playlist_id, pagenum)
        url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
        request = compat_urllib_request.Request(url)
        # NOTE(review): a `try:` should wrap the next line.
        page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        # NOTE(review): `ids_in_page = []` appears truncated in this copy.
        for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        video_ids.extend(ids_in_page)

        if self._MORE_PAGES_INDICATOR not in page:
            # last page reached (a `break` is expected here -- TODO confirm)
        pagenum = pagenum + 1

        total = len(video_ids)

        # Apply --playlist-start / --playlist-end slicing (1-based options).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        # NOTE(review): the `else:` line is missing in this copy.
            video_ids = video_ids[playliststart:playlistend]

        if len(video_ids) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        # NOTE(review): the `else:` line is missing in this copy.
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    # group(1) is the channel id
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    # list view sorted by date-added, fetched one page per request
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    # "Next »" link text marks the existence of further pages
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        """Collect all video ids of a channel and queue them for download."""
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard/return is missing here.
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        # Download channel pages
        channel_id = mobj.group(1)
        # NOTE(review): `video_ids = []`, `pagenum = 1` and the `while True:`
        # header appear truncated in this copy.
        self.report_download_page(channel_id, pagenum)
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        request = compat_urllib_request.Request(url)
        # NOTE(review): a `try:` should wrap the next line.
        page = compat_urllib_request.urlopen(request).read().decode('utf8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        # NOTE(review): `ids_in_page = []` appears truncated in this copy.
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        video_ids.extend(ids_in_page)

        if self._MORE_PAGES_INDICATOR not in page:
            # last page reached (a `break` is expected here -- TODO confirm)
        pagenum = pagenum + 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    # group(1) is the user name ("ytuser:NAME" is also accepted)
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # GData caps result pages at 50 entries per request
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Page through the user's uploads feed and queue every video."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard/return is missing here.
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.
        # NOTE(review): `video_ids = []`, `pagenum = 0` and the `while True:`
        # header appear truncated in this copy.
        start_index = pagenum * self._GDATA_PAGE_SIZE + 1
        self.report_download_page(username, start_index)

        request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

        # NOTE(review): a `try:` should wrap the next line.
        page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        # NOTE(review): `ids_in_page = []` appears truncated in this copy.
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        # again.
        if len(ids_in_page) < self._GDATA_PAGE_SIZE:
            # last page reached (a `break` is expected here -- TODO confirm)

        # Apply --playlist-start / --playlist-end slicing (1-based options).
        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        # NOTE(review): the `else:` line is missing in this copy.
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    # group(1) is the user name ("bliptvuser:NAME" is also accepted)
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    # NOTE(review): the _PAGE_SIZE attribute referenced below is missing
    # from this copy.
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Resolve the user's numeric id, then page through the AJAX
        episode list and queue every video."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard/return is missing here.
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        username = mobj.group(1)

        # mobile endpoint returning the full episode list for a user id
        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        # NOTE(review): a `try:` should wrap the three lines below.
        page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        # the numeric user id is embedded in the profile page markup
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        page_base = page_base % mobj.group(1)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.
        # NOTE(review): `video_ids = []`, `pagenum = 1` and the `while True:`
        # header appear truncated in this copy.
        self.report_download_page(username, pagenum)

        request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )

        # NOTE(review): a `try:` should wrap the next line.
        page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # NOTE(review): sibling extractors use compat_str(err) here; the
            # plain str(err) is inconsistent but left as-is.
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        # NOTE(review): `ids_in_page = []` appears truncated in this copy.
        for mobj in re.finditer(r'href="/([^"]+)"', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(unescapeHTML(mobj.group(1)))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        # again.
        if len(ids_in_page) < self._PAGE_SIZE:
            # last page reached (a `break` is expected here -- TODO confirm)

        # Apply --playlist-start / --playlist-end slicing (1-based options).
        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        # NOTE(review): the `else:` line is missing in this copy.
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    # (?#locale) is a regex comment: the "../" path segment is the locale
    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
    IE_NAME = u'DepositFiles'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Extract the direct file URL and title from a depositfiles page."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        # (POSTing gateway_result=1 simulates the button click)
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        # NOTE(review): the `try:` opening this block is missing in this copy.
        self.report_download_webpage(file_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # collapse whitespace in the site's own restriction notice
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            # NOTE(review): the `else:` line is missing in this copy.
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        # NOTE(review): the `if mobj is None:` guard/return is missing here.
        self._downloader.trouble(u'ERROR: unable to extract title')
        file_title = mobj.group(1).decode('utf-8')

        # Result dictionary entries (the surrounding `return [{ ... }]` and
        # the 'uploader'/'ext' neighbours may be truncated in this copy).
        'id': file_id.decode('utf-8'),
        'url': file_url.decode('utf-8'),
        'upload_date': None,
        'title': file_title,
        'ext': file_extension.decode('utf-8'),
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""
    # NOTE(review): truncated view -- 'try:' openers, 'else:' branches,
    # 'return' statements and parts of literals are missing below. Only
    # comments were added; visible code is unchanged.

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    # Mobile login endpoint; credentials are POSTed here in _real_initialize.
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    # Ordered best-first; _real_extract walks this list for format selection.
    _available_formats = ['video', 'highqual', 'lowqual']
    # NOTE(review): the entries and closing brace of this dict are not
    # visible in this view.
    _video_extensions = {
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page"""
        # Regexes for metadata embedded in the page's JavaScript calls.
        # NOTE(review): the 'video_info = {}' initialization and the closing
        # brace of this dict literal are not visible in this view.
        data = {'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                # Values are %-quoted and unicode-escaped inside the JS.
                video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Collect one URL per known format name ("<fmt>_src" JS variables).
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

    def _real_initialize(self):
        # Log in (best effort): failures only emit warnings, never abort.
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    # NOTE(review): the branch assigning useremail/password
                    # from 'info' is not visible; only the failure raise is.
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))

        if useremail is None:

        # NOTE(review): the construction of 'login_form' is not visible in
        # this view; it is POSTed to the mobile login endpoint here.
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
            login_results = compat_urllib_request.urlopen(request).read()
            # A login <form> still present in the response means login failed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): 'if mobj is None:' guard not visible in this view.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
            page = compat_urllib_request.urlopen(request)
            video_webpage = page.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = video_info['owner']

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')

        # thumbnail image -- only a warning; extraction continues without it.
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
            video_thumbnail = video_info['thumbnail']

        # upload date: parse an RFC 2822 date into YYYYMMDD.
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                upload_date = time.strftime('%Y%m%d', timetuple[0:9])

        # description
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        if len(url_map.keys()) > 0:
            # Decide which formats to download
            req_format = self._downloader.params.get('format', None)
            format_limit = self._downloader.params.get('format_limit', None)

            # --format-limit caps quality: drop formats better than the limit.
            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                video_url_list = [(req_format, url_map[req_format])] # Specific format

        # One info dict per selected format.
        for format_param, video_real_url in video_url_list:
            # Extension should be 'flv' or 'mp4'.
            video_extension = self._video_extensions.get(format_param, 'mp4')

            # NOTE(review): the enclosing 'results.append({' / 'return' of
            # this info dict is not visible in this view.
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension.decode('utf-8'),
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description.decode('utf-8'),
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""
    # NOTE(review): truncated view -- 'try:'/'if'/'return' lines and parts of
    # literals are missing below. Only comments were added.

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Pulls the filename extension off the end of a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): 'if mobj is None:' guard not visible in this view.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Ask blip.tv for a JSON description of the page instead of HTML.
        # NOTE(review): the assignment of 'cchar' ('?' or '&' depending on
        # whether the URL already has a query) is not visible in this view.
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        self.report_extraction(mobj.group(1))
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                # NOTE(review): only a fragment of the direct-download info
                # dict is visible here.
                    'upload_date': None,
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))

        if info is None: # Regular URL
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))

                json_data = json.loads(json_code)
                # The payload may be wrapped in a 'Post' envelope.
                if 'Post' in json_data:
                    data = json_data['Post']

                # blip.tv datestamps look like '%m-%d-%y %H:%M%p'; normalized
                # to the YYYYMMDD form expected by the downloader.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                # NOTE(review): 'if umobj is None:' guard not visible here.
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                # NOTE(review): enclosing 'info = {' of this dict is not
                # visible in this view.
                    'id': data['item_id'],
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl']
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))

        # blip.tv serves some content only to an iTunes user agent.
        std_headers['User-Agent'] = 'iTunes/10.6.1'
2253 class MyVideoIE(InfoExtractor):
2254 """Information Extractor for myvideo.de."""
2256 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2257 IE_NAME = u'myvideo'
2259 def __init__(self, downloader=None):
2260 InfoExtractor.__init__(self, downloader)
2262 def report_download_webpage(self, video_id):
2263 """Report webpage download."""
2264 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2266 def report_extraction(self, video_id):
2267 """Report information extraction."""
2268 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2270 def _real_extract(self,url):
2271 mobj = re.match(self._VALID_URL, url)
2273 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2276 video_id = mobj.group(1)
2279 request = compat_urllib_request.Request('http://www.myvideo.de/watch/%s' % video_id)
2281 self.report_download_webpage(video_id)
2282 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
2283 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2284 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
2287 self.report_extraction(video_id)
2288 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2291 self._downloader.trouble(u'ERROR: unable to extract media URL')
2293 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2295 mobj = re.search('<title>([^<]+)</title>', webpage)
2297 self._downloader.trouble(u'ERROR: unable to extract title')
2300 video_title = mobj.group(1)
2306 'upload_date': None,
2307 'title': video_title,
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """
    # NOTE(review): truncated view -- 'try:'/'else:' lines, the closing of
    # the _VALID_URL triple-quoted string, and parts of dict literals are
    # missing below. Only comments were added.

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                     |(https?://)?(www\.)?
                         (?P<showname>thedailyshow|colbertnation)\.com/
                        (full-episodes/(?P<episode>.*)|
                          (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                          |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
    IE_NAME = u'comedycentral'

    # Known bitrates, ordered worst-first (turls[-1] below is the best).
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # NOTE(review): entries and closing braces of these two dicts are not
    # visible in this view.
    _video_extensions = {
    _video_dimensions = {

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overrides the base class because _VALID_URL is written with
        # whitespace/comments and needs re.VERBOSE.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        """Report download of the per-media configuration XML."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        """Report download of the MRSS show index."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        """Report resolution of the SWF player URL."""
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

    def _print_formats(self, formats):
        # Used for --list-formats.
        print('Available formats:')
        # NOTE(review): the 'for x in formats:' loop line is not visible here.
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # NOTE(review): 'if mobj is None:' guard not visible in this view.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Shortname form (':tds', ':colbert', ...): rewrite to the canonical
        # full-episodes URL and re-match.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        # Clip URLs carry the title directly; episode URLs may need to
        # follow a redirect to the newest episode first.
        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
                epTitle = mobj.group('cntitle')
            dlNewest = not mobj.group('episode')
                epTitle = mobj.group('showname')
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            # After following the redirect, re-match against the final URL to
            # learn the concrete episode.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
            epTitle = mobj.group('episode')

        # mtvn player URI embedded either as an <param name="movie"> value or
        # a 'var url = "..."' JS assignment.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', html)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', html)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
            # Resolve redirects to get the final SWF player URL.
            urlHandle = compat_urllib_request.urlopen(playerUrl_raw)
            playerUrl = urlHandle.geturl()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + compat_str(err))

        uri = mMovieParams[0][1]
        # The MRSS index lists every <item> (segment) of the episode.
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            # guid looks like 'mgid:...:<showId>.com:<shortMediaId>'.
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle)
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            # Collect (bitrate, url) pairs; 'turls' accumulation lines are
            # not fully visible in this view.
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])

            # For now, just pick the highest bitrate
            format,video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
                format, video_url = f, v

            # Patch to download from alternative CDN, which does not
            # break on current RTMPDump builds
            broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
            better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"

            if video_url.startswith(broken_cdn):
                video_url = video_url.replace(broken_cdn, better_cdn)

            effTitle = showId + u'-' + epTitle
            # NOTE(review): enclosing 'info = {' of this dict is not visible
            # in this view.
                'upload_date': officialDate,
                'description': officialTitle,
                'player_url': None #playerUrl

            results.append(info)
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """
    # NOTE(review): truncated view -- 'try:'/'if'/'return' lines and the
    # enclosing info-dict literal are missing below. Only comments added.

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report information extraction."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report download of the player configuration."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): 'if mobj is None:' guard not visible in this view.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Decode using the charset advertised in Content-Type; utf-8
            # fallback when none is declared.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))

        # NOTE(review): none of these four searches is None-checked; a page
        # missing any meta tag would raise AttributeError on .group(1).
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        # The player URL carries the config location in its query string.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))

        playlist = config['playlist']
        # Second playlist entry holds the actual video URL.
        videoUrl = playlist[1]['url']

        # NOTE(review): enclosing 'info = {' of this dict is not visible in
        # this view.
            'uploader': showName,
            'upload_date': None,
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""
    # NOTE(review): truncated view -- 'try:'/'if'/'return' lines and parts
    # of the info dict are missing below. Only comments added.

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): 'if mobj is None:' guard not visible in this view.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # NOTE(review): the opening of the 'info = {' dict is not visible.
            'upload_date': None,

        self.report_extraction(video_id)
        # Moogaloop metadata XML describes the clip and points at the f4m
        # manifest.
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
            # NOTE(review): the 'except IndexError:' line for this handler is
            # not visible in this view.
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')

        # hdcore parameter is required by the Adobe HTTP Dynamic Streaming
        # endpoint.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            # Rebinds video_id to the manifest's <id> (used to build the
            # fragment URL below).
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')

        url_pr = compat_urllib_parse_urlparse(manifest_url)
        # First fragment of the f4m stream ('/z<id-without-suffix>/<node>Seg1-Frag1').
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""
    # NOTE(review): truncated view -- 'try:'/'if'/'return' lines and parts of
    # the info dict are missing below. Only comments added.

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): 'if mobj is None:' guard not visible in this view.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        request = compat_urllib_request.Request(r'http://www.xvideos.com/video' + video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            # 'replace' keeps extraction going on malformed byte sequences.
            webpage = webpage_bytes.decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        self.report_extraction(video_id)

        # Extract video URL (percent-encoded in the 'flv_url' parameter).
        mobj = re.search(r'flv_url=(.+?)&', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = compat_urllib_parse.unquote(mobj.group(1))

        # Extract title from the page <title> (trailing ' - XVID...' dropped).
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1)

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        # group(0) -- the whole matched URL, not just the filename group.
        video_thumbnail = mobj.group(0)

        # NOTE(review): enclosing 'info = {' of this dict is not visible in
        # this view.
            'upload_date': None,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'description': None,
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
    """
    # NOTE(review): truncated view -- 'try:'/'if'/'return' lines and parts of
    # the info dict are missing below. Only comments added.

    # Group 1: uploader slug; group 2: track slug.
    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): 'if mobj is None:' guard not visible in this view.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # resolve.json maps the permalink URL to the track's API record.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # Second request fetches the per-track stream URL map.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
            stream_json_bytes = compat_urllib_request.urlopen(request).read()
            stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        # NOTE(review): enclosing 'return [{' of the info dict is not visible
        # in this view. 'created_at' is passed through as-is here; other IEs
        # normalize upload_date to YYYYMMDD -- worth confirming upstream.
            'uploader': info['user']['username'],
            'upload_date': info['created_at'],
            'title': info['title'],
            'description': info['description'],
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    # NOTE(review): truncated view -- 'try:'/'if'/'return' lines and parts of
    # the info dict are missing below. Only comments added.

    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): 'if mobj is None:' guard not visible in this view.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        self.report_webpage(url)

        request = compat_urllib_request.Request(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        self.report_extraction(url)

        # Extract video URL: 'jsclassref' holds the base64-encoded RTMP path.
        # NOTE(review): str.decode('base64') is Python-2-only; would need
        # base64.b64decode on Python 3 -- consistent with this file's era.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
            self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + compat_urllib_parse.unquote(mobj.group(1).decode('base64'))

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1).decode('utf-8')

        # Derive id/extension from the last path component of the RTMP URL.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        # NOTE(review): enclosing 'info = {' of this dict is not visible in
        # this view.
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'description': video_description,
2884 class MixcloudIE(InfoExtractor):
2885 """Information extractor for www.mixcloud.com"""
# NOTE(review): marked not working — the site introduced a newer JSON API
# after this extractor was written (see the linked developer documentation).
# NOTE(review): this listing is line-sampled; some statements (try: openers,
# guards, returns) are elided — only comments are added here.
2887 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2888 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2889 IE_NAME = u'mixcloud'
2891 def __init__(self, downloader=None):
2892 InfoExtractor.__init__(self, downloader)
2894 def report_download_json(self, file_id):
2895 """Report JSON download."""
2896 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2898 def report_extraction(self, file_id):
2899 """Report information extraction."""
2900 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2902 def get_urls(self, jsonData, fmt, bitrate='best'):
2903 """Get urls from 'audio_formats' section in json"""
# jsonData[fmt] is either a {bitrate: [urls]} mapping or a plain list of
# urls; the TypeError fallback below handles the bitrate-less shape.
2906 bitrate_list = jsonData[fmt]
2907 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2908 bitrate = max(bitrate_list) # select highest
2910 url_list = jsonData[fmt][bitrate]
2911 except TypeError: # we have no bitrate info.
2912 url_list = jsonData[fmt]
2915 def check_urls(self, url_list):
"""Return the first URL in url_list that answers an HTTP request."""
2917 for url in url_list:
# Probe the candidate with a real request; on network failure the
# loop moves on to the next candidate.
2919 compat_urllib_request.urlopen(url)
2921 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2926 def _print_formats(self, formats):
# Print a "format  bitrate  [ext]" table; used for --list-formats.
2927 print('Available formats:')
2928 for fmt in formats.keys():
2929 for b in formats[fmt]:
2931 ext = formats[fmt][b][0]
2932 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2933 except TypeError: # we have no bitrate info
# Bitrate-less shape: formats[fmt] is the url list itself.
2934 ext = formats[fmt][0]
2935 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2938 def _real_extract(self, url):
2939 mobj = re.match(self._VALID_URL, url)
2941 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2943 # extract uploader & filename from url
2944 uploader = mobj.group(1).decode('utf-8')
2945 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2947 # construct API request
2948 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2949 # retrieve .json file with links to files
2950 request = compat_urllib_request.Request(file_url)
2952 self.report_download_json(file_url)
2953 jsonData = compat_urllib_request.urlopen(request).read()
2954 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2955 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
# Parse the API response; 'audio_formats' maps format name -> urls.
2959 json_data = json.loads(jsonData)
2960 player_url = json_data['player_swf_url']
2961 formats = dict(json_data['audio_formats'])
2963 req_format = self._downloader.params.get('format', None)
2966 if self._downloader.params.get('listformats', None):
2967 self._print_formats(formats)
# "best" (or unspecified): walk every advertised format until one of its
# urls actually responds.
2970 if req_format is None or req_format == 'best':
2971 for format_param in formats.keys():
2972 url_list = self.get_urls(formats, format_param)
2974 file_url = self.check_urls(url_list)
2975 if file_url is not None:
# Explicit format requested: error out if the site does not offer it.
2978 if req_format not in formats.keys():
2979 self._downloader.trouble(u'ERROR: format is not available')
2982 url_list = self.get_urls(formats, req_format)
2983 file_url = self.check_urls(url_list)
2984 format_param = req_format
2987 'id': file_id.decode('utf-8'),
2988 'url': file_url.decode('utf-8'),
2989 'uploader': uploader.decode('utf-8'),
2990 'upload_date': None,
2991 'title': json_data['name'],
2992 'ext': file_url.split('.')[-1].decode('utf-8'),
2993 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2994 'thumbnail': json_data['thumbnail_url'],
2995 'description': json_data['description'],
2996 'player_url': player_url.decode('utf-8'),
2999 class StanfordOpenClassroomIE(InfoExtractor):
3000 """Information extractor for Stanford's Open ClassRoom"""
# The URL regex distinguishes three cases handled below: a single video
# (course + video groups), a course page (course only), and the site root.
3002 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3003 IE_NAME = u'stanfordoc'
3005 def report_download_webpage(self, objid):
3006 """Report information extraction."""
3007 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3009 def report_extraction(self, video_id):
3010 """Report information extraction."""
3011 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3013 def _real_extract(self, url):
3014 mobj = re.match(self._VALID_URL, url)
3016 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Case 1: a specific video — fetch its metadata XML and resolve the
# direct media URL from <videoFile>.
3019 if mobj.group('course') and mobj.group('video'): # A specific video
3020 course = mobj.group('course')
3021 video = mobj.group('video')
3023 'id': course + '_' + video,
3025 'upload_date': None,
3028 self.report_extraction(info['id'])
3029 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3030 xmlUrl = baseUrl + video + '.xml'
3032 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
3033 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3034 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
3036 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3038 info['title'] = mdoc.findall('./title')[0].text
3039 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3041 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3043 info['ext'] = info['url'].rpartition('.')[2]
# Case 2: a course page — scrape the VideoPage links and recursively
# extract each one via self.extract().
3045 elif mobj.group('course'): # A course page
3046 course = mobj.group('course')
3051 'upload_date': None,
3054 self.report_download_webpage(info['id'])
3056 coursepage = compat_urllib_request.urlopen(url).read()
3057 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3058 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
3061 m = re.search('<h1>([^<]+)</h1>', coursepage)
3063 info['title'] = unescapeHTML(m.group(1))
# Fall back to the id when the page has no <h1> title.
3065 info['title'] = info['id']
3067 m = re.search('<description>([^<]+)</description>', coursepage)
3069 info['description'] = unescapeHTML(m.group(1))
# orderedSet keeps the page order while removing duplicate links.
3071 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3074 'type': 'reference',
3075 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3079 for entry in info['list']:
3080 assert entry['type'] == 'reference'
3081 results += self.extract(entry['url'])
# Case 3: the site root — scrape the CoursePage links and recurse the
# same way as for a course page.
3086 'id': 'Stanford OpenClassroom',
3089 'upload_date': None,
3092 self.report_download_webpage(info['id'])
3093 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3095 rootpage = compat_urllib_request.urlopen(rootURL).read()
3096 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3097 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
3100 info['title'] = info['id']
3102 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3105 'type': 'reference',
3106 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3111 for entry in info['list']:
3112 assert entry['type'] == 'reference'
3113 results += self.extract(entry['url'])
3116 class MTVIE(InfoExtractor):
3117 """Information extractor for MTV.com"""
3119 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3122 def report_webpage(self, video_id):
3123 """Report information extraction."""
3124 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3126 def report_extraction(self, video_id):
3127 """Report information extraction."""
3128 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3130 def _real_extract(self, url):
3131 mobj = re.match(self._VALID_URL, url)
3133 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# The scheme is optional in _VALID_URL; normalize to http:// before use.
3135 if not mobj.group('proto'):
3136 url = 'http://' + url
3137 video_id = mobj.group('videoid')
3138 self.report_webpage(video_id)
3140 request = compat_urllib_request.Request(url)
3142 webpage = compat_urllib_request.urlopen(request).read()
3143 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3144 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
# Song name and performer come from the page's mtv_vt / mtv_an meta tags;
# the page is decoded as iso-8859-1 before HTML-unescaping.
3147 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3149 self._downloader.trouble(u'ERROR: unable to extract song name')
3151 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3152 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3154 self._downloader.trouble(u'ERROR: unable to extract performer')
3156 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3157 video_title = performer + ' - ' + song_name
3159 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3161 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3163 mtvn_uri = mobj.group(1)
3165 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3167 self._downloader.trouble(u'ERROR: unable to extract content id')
3169 content_id = mobj.group(1)
# mediaGen.jhtml returns an XML playlist of renditions for this video.
3171 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3172 self.report_extraction(video_id)
3173 request = compat_urllib_request.Request(videogen_url)
3175 metadataXml = compat_urllib_request.urlopen(request).read()
3176 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3177 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
3180 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3181 renditions = mdoc.findall('.//rendition')
3183 # For now, always pick the highest quality.
3184 rendition = renditions[-1]
# type is a MIME string like "video/mp4"; keep the subtype as extension
# and build a descriptive format label from width/height/bitrate.
3187 _,_,ext = rendition.attrib['type'].partition('/')
3188 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3189 video_url = rendition.find('./src').text
3191 self._downloader.trouble('Invalid rendition field.')
3197 'uploader': performer,
3198 'upload_date': None,
3199 'title': video_title,
3207 class YoukuIE(InfoExtractor):
3209 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3212 def __init__(self, downloader=None):
3213 InfoExtractor.__init__(self, downloader)
3215 def report_download_webpage(self, file_id):
3216 """Report webpage download."""
3217 self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)
3219 def report_extraction(self, file_id):
3220 """Report information extraction."""
3221 self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)
# Build a pseudo-unique session id: millisecond timestamp plus two
# random number blocks, concatenated as a string.
3224 nowTime = int(time.time() * 1000)
3225 random1 = random.randint(1000,1998)
3226 random2 = random.randint(1000,9999)
3228 return "%d%d%d" %(nowTime,random1,random2)
3230 def _get_file_ID_mix_string(self, seed):
# Deterministically shuffle the source alphabet using a simple linear
# congruential generator driven by `seed`; each step picks and removes
# one character, so the result is a seed-specific permutation.
3232 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3234 for i in range(len(source)):
3235 seed = (seed * 211 + 30031 ) % 65536
3236 index = math.floor(seed / 65536 * len(source) )
3237 mixed.append(source[int(index)])
3238 source.remove(source[int(index)])
3239 #return ''.join(mixed)
3242 def _get_file_id(self, fileId, seed):
# Decode the scrambled file id: each '*'-separated token is an index
# into the shuffled alphabet built from the page's seed.
3243 mixed = self._get_file_ID_mix_string(seed)
3244 ids = fileId.split('*')
3248 realId.append(mixed[int(ch)])
3249 return ''.join(realId)
3251 def _real_extract(self, url):
3252 mobj = re.match(self._VALID_URL, url)
3254 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3256 video_id = mobj.group('ID')
# getPlayList returns a JSON config with title, descrambling seed,
# per-format file ids and per-segment keys.
3258 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3260 request = compat_urllib_request.Request(info_url, None, std_headers)
3262 self.report_download_webpage(video_id)
3263 jsondata = compat_urllib_request.urlopen(request).read()
3264 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3265 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3268 self.report_extraction(video_id)
3270 jsonstr = jsondata.decode('utf-8')
3271 config = json.loads(jsonstr)
3273 video_title = config['data'][0]['title']
3274 seed = config['data'][0]['seed']
3276 format = self._downloader.params.get('format', None)
3277 supported_format = config['data'][0]['streamfileids'].keys()
# Map the user's format request onto the formats the page advertises.
3279 if format is None or format == 'best':
3280 if 'hd2' in supported_format:
3285 elif format == 'worst':
3293 fileid = config['data'][0]['streamfileids'][format]
3294 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3295 except (UnicodeDecodeError, ValueError, KeyError):
3296 self._downloader.trouble(u'ERROR: unable to extract info section')
3300 sid = self._gen_sid()
3301 fileid = self._get_file_id(fileid, seed)
3303 #column 8,9 of fileid represent the segment number
3304 #fileid[7:9] should be changed
# Each segment gets its own download URL: the segment index is patched
# into the file id (hex, two digits) and signed with the segment key.
3305 for index, key in enumerate(keys):
3307 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3308 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3311 'id': '%s_part%02d' % (video_id, index),
3312 'url': download_url,
3314 'upload_date': None,
3315 'title': video_title,
3318 files_info.append(info)
3323 class XNXXIE(InfoExtractor):
3324 """Information extractor for xnxx.com"""
3326 _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
# All metadata is scraped from the watch page with these three patterns.
3328 VIDEO_URL_RE = r'flv_url=(.*?)&'
3329 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3330 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3332 def report_webpage(self, video_id):
3333 """Report information extraction."""
3334 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3336 def report_extraction(self, video_id):
3337 """Report information extraction."""
3338 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3340 def _real_extract(self, url):
3341 mobj = re.match(self._VALID_URL, url)
3343 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3345 video_id = mobj.group(1)
3347 self.report_webpage(video_id)
3349 # Get webpage content
3351 webpage_bytes = compat_urllib_request.urlopen(url).read()
3352 webpage = webpage_bytes.decode('utf-8')
3353 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3354 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3357 result = re.search(self.VIDEO_URL_RE, webpage)
3359 self._downloader.trouble(u'ERROR: unable to extract video url')
# flv_url is percent-encoded in the page source; unquote it.
3361 video_url = compat_urllib_parse.unquote(result.group(1))
3363 result = re.search(self.VIDEO_TITLE_RE, webpage)
3365 self._downloader.trouble(u'ERROR: unable to extract video title')
3367 video_title = result.group(1)
3369 result = re.search(self.VIDEO_THUMB_RE, webpage)
3371 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3373 video_thumbnail = result.group(1)
3379 'upload_date': None,
3380 'title': video_title,
3382 'thumbnail': video_thumbnail,
3383 'description': None,
3387 class GooglePlusIE(InfoExtractor):
3388 """Information extractor for plus.google.com."""
3390 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3391 IE_NAME = u'plus.google'
3393 def __init__(self, downloader=None):
3394 InfoExtractor.__init__(self, downloader)
3396 def report_extract_entry(self, url):
3397 """Report downloading entry."""
3398 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3400 def report_date(self, upload_date):
3401 """Report the entry's upload date."""
3402 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3404 def report_uploader(self, uploader):
3405 """Report the entry's uploader."""
3406 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3408 def report_title(self, video_title):
3409 """Report the entry's title."""
3410 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3412 def report_extract_vid_page(self, video_page):
3413 """Report information extraction."""
3414 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3416 def _real_extract(self, url):
3417 # Extract id from URL
3418 mobj = re.match(self._VALID_URL, url)
3420 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3423 post_url = mobj.group(0)
3424 video_id = mobj.group(1)
3426 video_extension = 'flv'
3428 # Step 1, Retrieve post webpage to extract further information
3429 self.report_extract_entry(post_url)
3430 request = compat_urllib_request.Request(post_url)
3432 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3433 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3434 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3437 # Extract update date
3439 pattern = 'title="Timestamp">(.*?)</a>'
3440 mobj = re.search(pattern, webpage)
3442 upload_date = mobj.group(1)
3443 # Convert timestring to a format suitable for filename
3444 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3445 upload_date = upload_date.strftime('%Y%m%d')
3446 self.report_date(upload_date)
# Extract uploader from the rel="author" anchor.
3450 pattern = r'rel\="author".*?>(.*?)</a>'
3451 mobj = re.search(pattern, webpage)
3453 uploader = mobj.group(1)
3454 self.report_uploader(uploader)
3457 # Get the first line for title
3459 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3460 mobj = re.search(pattern, webpage)
3462 video_title = mobj.group(1)
3463 self.report_title(video_title)
3465 # Step 2, Stimulate clicking the image box to launch video
3466 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3467 mobj = re.search(pattern, webpage)
3469 self._downloader.trouble(u'ERROR: unable to extract video page URL')
3471 video_page = mobj.group(1)
3472 request = compat_urllib_request.Request(video_page)
3474 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3475 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3476 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3478 self.report_extract_vid_page(video_page)
3481 # Extract video links on video page
3482 """Extract video links of all sizes"""
3483 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3484 mobj = re.findall(pattern, webpage)
3486 self._downloader.trouble(u'ERROR: unable to extract video links')
3488 # Sort in resolution
3489 links = sorted(mobj)
3491 # Choose the lowest of the sort, i.e. highest resolution
3492 video_url = links[-1]
3493 # Only get the url. The resolution part in the tuple has no use anymore
3494 video_url = video_url[-1]
3495 # Treat escaped \u0026 style hex
# On Python 2 the url is a bytestring with a .decode method; the
# AttributeError branch handles Python 3 str objects.
3497 video_url = video_url.decode("unicode_escape")
3498 except AttributeError: # Python 3
3499 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3505 'uploader': uploader,
3506 'upload_date': upload_date,
3507 'title': video_title,
3508 'ext': video_extension,
3511 class NBAIE(InfoExtractor):
"""Information extractor for watch.nba.com / www.nba.com video pages."""
3512 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3515 def report_extraction(self, video_id):
3516 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3518 def _real_extract(self, url):
3519 mobj = re.match(self._VALID_URL, url)
3521 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3524 video_id = mobj.group(1)
# Normalize directory-style URLs that end in /index.html.
3525 if video_id.endswith('/index.html'):
3526 video_id = video_id[:-len('/index.html')]
3528 self.report_extraction(video_id)
3530 urlh = compat_urllib_request.urlopen(url)
3531 webpage_bytes = urlh.read()
3532 webpage = webpage_bytes.decode('utf-8', 'ignore')
3533 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3534 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
# The media URL is built directly from the page path on Turner's CDN;
# the 1280x720 mp4 rendition is assumed to exist.
3537 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
# Helper: first regex group from the page, HTML-unescaped, or `default`.
3538 def _findProp(rexp, default=None):
3539 m = re.search(rexp, webpage)
3541 return unescapeHTML(m.group(1))
3545 shortened_video_id = video_id.rpartition('/')[2]
3546 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3548 'id': shortened_video_id,
# NOTE(review): 'uploader_date' looks like a typo for 'upload_date'
# (the field name documented on InfoExtractor) — confirm before fixing.
3552 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3553 'description': _findProp(r'<div class="description">(.*?)</h1>'),
3557 class JustinTVIE(InfoExtractor):
3558 """Information extractor for justin.tv and twitch.tv"""
3559 # TODO: One broadcast may be split into multiple videos. The key
3560 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3561 # starts at 1 and increases. Can we treat all parts as one video?
3563 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3564 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3565 _JUSTIN_PAGE_LIMIT = 100
3566 IE_NAME = u'justin.tv'
3568 def report_extraction(self, file_id):
3569 """Report information extraction."""
3570 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3572 def report_download_page(self, channel, offset):
3573 """Report attempt to download a single page of videos."""
3574 self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
3575 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3577 # Return count of items, list of *valid* items
3578 def _parse_page(self, url):
3580 urlh = compat_urllib_request.urlopen(url)
3581 webpage_bytes = urlh.read()
3582 webpage = webpage_bytes.decode('utf-8', 'ignore')
3583 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3584 self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
3587 response = json.loads(webpage)
3589 for clip in response:
3590 video_url = clip['video_file_url']
3592 video_extension = os.path.splitext(video_url)[1][1:]
3593 video_date = re.sub('-', '', clip['created_on'][:10])
3597 'title': clip['title'],
3598 'uploader': clip.get('user_id', clip.get('channel_id')),
3599 'upload_date': video_date,
3600 'ext': video_extension,
3602 return (len(response), info)
3604 def _real_extract(self, url):
3605 mobj = re.match(self._VALID_URL, url)
3607 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3610 api = 'http://api.justin.tv'
3611 video_id = mobj.group(mobj.lastindex)
3613 if mobj.lastindex == 1:
3615 api += '/channel/archives/%s.json'
3617 api += '/clip/show/%s.json'
3618 api = api % (video_id,)
3620 self.report_extraction(video_id)
3624 limit = self._JUSTIN_PAGE_LIMIT
3627 self.report_download_page(video_id, offset)
3628 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3629 page_count, page_info = self._parse_page(page_url)
3630 info.extend(page_info)
3631 if not paged or page_count != limit: