2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
13 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor base class.

    Given a URL, an information extractor produces a list of dictionaries
    describing the video(s) the URL refers to (real media URL, title,
    uploader, ...).  The list is handed to the FileDownloader, which may
    then download the media to the file system, among other outcomes.

    Each dictionary must include:
        title:        Video title, unescaped.
        ext:          Video filename extension.
        uploader:     Full name of the video uploader.
        upload_date:  Video upload date (YYYYMMDD).

    Optional fields:
        format:       The video format, defaults to ext (used for --get-format).
        thumbnail:    Full URL to a video thumbnail image.
        description:  One-line video description.
        uploader_id:  Nickname or id of the video uploader.
        player_url:   SWF Player URL (used for rtmpdump).
        subtitles:    The .srt file contents.
        urlhandle:    [internal] urlHandle used to download the file,
                      like returned by urllib.request.urlopen.

    All fields should be Unicode strings.

    Subclasses should redefine _real_initialize() and _real_extract() and
    define a _VALID_URL regexp; _real_extract() must return a *list* of
    information dictionaries.  Set _WORKING = False on broken extractors
    to warn users and skip tests.
    """

    # NOTE(review): attribute defaults reconstructed from context — the
    # original lines are elided in this copy; verify against upstream.
    _ready = False        # becomes True once _real_initialize() has run
    _downloader = None    # FileDownloader instance set via set_downloader()
    _WORKING = True       # False marks a known-broken extractor

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url) is not None

    def working(self):
        """Getter method for _WORKING."""
        return self._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc), at most once."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Default IE name: class name minus the trailing "IE" suffix.
        return type(self).__name__[:-2]
108 class YoutubeIE(InfoExtractor):
109 """Information extractor for youtube.com."""
113 (?:https?://)? # http(s):// (optional)
114 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
115 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
116 (?:.*?\#/)? # handle anchor (#/) redirect urls
117 (?!view_play_list|my_playlists|artist|playlist) # ignore playlist URLs
118 (?: # the various things that can precede the ID:
119 (?:(?:v|embed|e)/) # v/ or embed/ or e/
120 |(?: # or the v= param in all its forms
121 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
122 (?:\?|\#!?) # the params delimiter ? or # or #!
123 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
126 )? # optional -> youtube.com/xxxx is OK
127 )? # all until now is optional -> you can pass the naked ID
128 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
129 (?(1).+)? # if we found the ID, everything can follow
131 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
132 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
133 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
134 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
135 _NETRC_MACHINE = 'youtube'
136 # Listed in order of quality
137 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
138 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
139 _video_extensions = {
145 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
151 _video_dimensions = {
169 def suitable(self, url):
170 """Receives a URL and returns True if suitable for this IE."""
171 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
173 def report_lang(self):
174 """Report attempt to set language."""
175 self._downloader.to_screen(u'[youtube] Setting language')
177 def report_login(self):
178 """Report attempt to log in."""
179 self._downloader.to_screen(u'[youtube] Logging in')
181 def report_age_confirmation(self):
182 """Report attempt to confirm age."""
183 self._downloader.to_screen(u'[youtube] Confirming age')
185 def report_video_webpage_download(self, video_id):
186 """Report attempt to download video webpage."""
187 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
189 def report_video_info_webpage_download(self, video_id):
190 """Report attempt to download video info webpage."""
191 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
193 def report_video_subtitles_download(self, video_id):
194 """Report attempt to download video info webpage."""
195 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
197 def report_information_extraction(self, video_id):
198 """Report attempt to extract video information."""
199 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
201 def report_unavailable_format(self, video_id, format):
202 """Report extracted video URL."""
203 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
205 def report_rtmp_download(self):
206 """Indicate the download will use the RTMP protocol."""
207 self._downloader.to_screen(u'[youtube] RTMP download detected')
209 def _closed_captions_xml_to_srt(self, xml_string):
211 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
212 # TODO parse xml instead of regex
213 for n, (start, dur_tag, dur, caption) in enumerate(texts):
214 if not dur: dur = '4'
216 end = start + float(dur)
217 start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
218 end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
219 caption = unescapeHTML(caption)
220 caption = unescapeHTML(caption) # double cycle, intentional
221 srt += str(n+1) + '\n'
222 srt += start + ' --> ' + end + '\n'
223 srt += caption + '\n\n'
226 def _extract_subtitles(self, video_id):
227 self.report_video_subtitles_download(video_id)
228 request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
230 srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
231 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
232 return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
233 srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
234 srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
235 if not srt_lang_list:
236 return (u'WARNING: video has no closed captions', None)
237 if self._downloader.params.get('subtitleslang', False):
238 srt_lang = self._downloader.params.get('subtitleslang')
239 elif 'en' in srt_lang_list:
242 srt_lang = list(srt_lang_list.keys())[0]
243 if not srt_lang in srt_lang_list:
244 return (u'WARNING: no closed captions found in the specified language', None)
245 request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
247 srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
248 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
249 return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
251 return (u'WARNING: unable to download video subtitles', None)
252 return (None, self._closed_captions_xml_to_srt(srt_xml))
254 def _print_formats(self, formats):
255 print('Available formats:')
257 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
259 def _real_initialize(self):
260 if self._downloader is None:
265 downloader_params = self._downloader.params
267 # Attempt to use provided username and password or .netrc data
268 if downloader_params.get('username', None) is not None:
269 username = downloader_params['username']
270 password = downloader_params['password']
271 elif downloader_params.get('usenetrc', False):
273 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
278 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
279 except (IOError, netrc.NetrcParseError) as err:
280 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
284 request = compat_urllib_request.Request(self._LANG_URL)
287 compat_urllib_request.urlopen(request).read()
288 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
289 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
292 # No authentication to be performed
298 'current_form': 'loginForm',
300 'action_login': 'Log In',
301 'username': username,
302 'password': password,
304 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
307 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
308 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
309 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
311 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
312 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
318 'action_confirm': 'Confirm',
320 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
322 self.report_age_confirmation()
323 age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
324 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
325 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
328 def _extract_id(self, url):
329 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
331 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
333 video_id = mobj.group(2)
336 def _real_extract(self, url):
337 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
338 mobj = re.search(self._NEXT_URL_RE, url)
340 url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
341 video_id = self._extract_id(url)
344 self.report_video_webpage_download(video_id)
345 url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
346 request = compat_urllib_request.Request(url)
348 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
349 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
350 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
353 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
355 # Attempt to extract SWF player URL
356 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
358 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
363 self.report_video_info_webpage_download(video_id)
364 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
365 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
366 % (video_id, el_type))
367 request = compat_urllib_request.Request(video_info_url)
369 video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
370 video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
371 video_info = compat_parse_qs(video_info_webpage)
372 if 'token' in video_info:
374 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
375 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
377 if 'token' not in video_info:
378 if 'reason' in video_info:
379 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
381 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
384 # Check for "rental" videos
385 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
386 self._downloader.trouble(u'ERROR: "rental" videos not supported')
389 # Start extracting information
390 self.report_information_extraction(video_id)
393 if 'author' not in video_info:
394 self._downloader.trouble(u'ERROR: unable to extract uploader name')
396 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
399 video_uploader_id = None
400 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/user/([^"]+)">', video_webpage)
402 video_uploader_id = mobj.group(1)
404 self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
407 if 'title' not in video_info:
408 self._downloader.trouble(u'ERROR: unable to extract video title')
410 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
413 if 'thumbnail_url' not in video_info:
414 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
416 else: # don't panic if we can't find it
417 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
421 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
423 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
424 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
425 for expression in format_expressions:
427 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
432 video_description = get_element_by_id("eow-description", video_webpage)
433 if video_description:
434 video_description = clean_html(video_description)
436 video_description = ''
439 video_subtitles = None
440 if self._downloader.params.get('writesubtitles', False):
441 (srt_error, video_subtitles) = self._extract_subtitles(video_id)
443 self._downloader.trouble(srt_error)
445 if 'length_seconds' not in video_info:
446 self._downloader.trouble(u'WARNING: unable to extract video duration')
449 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
452 video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
454 # Decide which formats to download
455 req_format = self._downloader.params.get('format', None)
457 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
458 self.report_rtmp_download()
459 video_url_list = [(None, video_info['conn'][0])]
460 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
461 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
462 url_data = [compat_parse_qs(uds) for uds in url_data_strs]
463 url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
464 url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
466 format_limit = self._downloader.params.get('format_limit', None)
467 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
468 if format_limit is not None and format_limit in available_formats:
469 format_list = available_formats[available_formats.index(format_limit):]
471 format_list = available_formats
472 existing_formats = [x for x in format_list if x in url_map]
473 if len(existing_formats) == 0:
474 self._downloader.trouble(u'ERROR: no known formats available for video')
476 if self._downloader.params.get('listformats', None):
477 self._print_formats(existing_formats)
479 if req_format is None or req_format == 'best':
480 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
481 elif req_format == 'worst':
482 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
483 elif req_format in ('-1', 'all'):
484 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
486 # Specific formats. We pick the first in a slash-delimeted sequence.
487 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
488 req_formats = req_format.split('/')
489 video_url_list = None
490 for rf in req_formats:
492 video_url_list = [(rf, url_map[rf])]
494 if video_url_list is None:
495 self._downloader.trouble(u'ERROR: requested format not available')
498 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
502 for format_param, video_real_url in video_url_list:
504 video_extension = self._video_extensions.get(format_param, 'flv')
506 video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
507 self._video_dimensions.get(format_param, '???'))
511 'url': video_real_url,
512 'uploader': video_uploader,
513 'uploader_id': video_uploader_id,
514 'upload_date': upload_date,
515 'title': video_title,
516 'ext': video_extension,
517 'format': video_format,
518 'thumbnail': video_thumbnail,
519 'description': video_description,
520 'player_url': player_url,
521 'subtitles': video_subtitles,
522 'duration': video_duration
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age by posting the family filter opt-out form.
        # NOTE(review): 'filters' field value reconstructed — confirm upstream.
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube; if so, delegate to that IE.
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
            return

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fall back to the flashvars blob.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Pick the best quality key that is present in flashvars.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                max_quality = key
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
                break
        else:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video URL')
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if mobj is None:
            # lookin for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
            else:
                video_uploader = mobj_official.group(1)
        else:
            video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if mobj is not None:
            # Page shows DD-MM-YYYY; reorder to YYYYMMDD.
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        # The title tag also carries the uploader name.
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            # Recurse exactly once on the canonical /watch/ URL.
            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
            return
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
        yv_bitrate = '700' # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
        }]
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs; group(1) is the numeric video id
    _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page's player setup
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except Exception:
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = {'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # for..else: the else runs only when no quality bucket had a match
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    % (video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
        }]
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    # Matches French/German video pages on videos.arte.tv
    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live streams are recognized by their index-NNN.html page name
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download *url* and return the raw page data (None on error)."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex*, and return a dict built from
        *matchTuples*, a list of (group_index, key, error_message)."""
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        # Language code is the 4th-from-last path segment on live pages
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path', u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url', u'ERROR: could not extract video url: %s' % url)
            ]
        )
        # NOTE(review): video_url is built but never returned or used here,
        # mirroring upstream behavior -- live extraction is effectively a stub.
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        # Language code is the 3rd-from-last path segment on +7 pages
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':          info.get('id'),
            'url':         compat_urllib_parse.unquote(info.get('url')),
            'uploader':    u'arte.tv',
            'upload_date': info.get('date'),
            'title':       info.get('title').decode('utf-8'),
            'ext':         u'mp4',
            'format':      u'NA',
            'player_url':  None,
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            # Force an HTTP HEAD so we never download the body here
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers; a HEAD has no body
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      headers=newheaders,
                                                                      origin_req_host=req.get_origin_req_host(),
                                                                      unverifiable=True))

        # Build our opener from scratch so only the handlers above are used
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True

    def _real_extract(self, url):
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_uploader = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
        }]
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the leading 'ytsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        # The API serves 50 results per page; keep paging until we have enough
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # The server may report fewer total results than requested
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the leading 'gvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # No next-page link: download everything we found
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the leading 'yvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # No next-page link: download everything we found
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Single video case: hand the video URL back to the downloader
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])
            return

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        else:
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(playlist_id, pagenum)
            url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        total = len(video_ids)

        # Honor --playlist-start / --playlist-end (1-based, -1 = to the end)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        if len(video_ids) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Download channel pages
        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(channel_id, pagenum)
            url = self._TEMPLATE_URL % (channel_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []
            for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)

        # Honor --playlist-start / --playlist-end (1-based, -1 = to the end)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    # The mobile episode-list endpoint serves 12 entries per page
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            # Resolve the numeric users_id embedded in the profile page
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            mobj = re.search(r'data-users-id="([^"]+)"', page)
            page_base = page_base % mobj.group(1)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)

            request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)

        # Honor --playlist-start / --playlist-end (1-based, -1 = to the end)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
    IE_NAME = u'DepositFiles'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': None,
            'upload_date': None,
            'title': file_title,
            'ext': file_extension.decode('utf-8'),
        }]
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    # NOTE(review): this capture of the source is elided — several
    # control-flow lines (try:/else:/return) and literal bodies are missing
    # between the statements below; '[elided]' comments mark obvious gaps.

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    # Machine name looked up in the user's ~/.netrc for credentials.
    _NETRC_MACHINE = 'facebook'
    # Format identifiers as they appear in the page source.
    _available_formats = ['video', 'highqual', 'lowqual']
    _video_extensions = {
    # [elided: format-name -> file-extension entries and closing brace]
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page"""
        # Regexes for metadata embedded in the page's JavaScript.
        data = {'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
        # [elided: dict close and video_info initialisation]
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                # NOTE(review): str has no .decode on Python 3 — verify the
                # intended bytes/str handling of the page contents.
                video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # [elided: video_urls dict initialisation]
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls
        # [elided: return of video_info]

    def _real_initialize(self):
        # Log in (if credentials were supplied) before any extraction runs.
        if self._downloader is None:
        # [elided: early return and credential defaults]

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # [elided: try:]
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
            # [elided: unpacking of useremail/password from the netrc entry]
            # [elided: else:]
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))

        if useremail is None:
        # [elided: early return and login_form construction]

        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        # [elided: try: and report_login()]
        login_results = compat_urllib_request.urlopen(request).read()
        # The login form being present in the response means login failed.
        if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
            self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
        # [elided: return]
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # [elided: 'if mobj is None:' guard]
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        # [elided: try:]
        page = compat_urllib_request.urlopen(request)
        video_webpage = page.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = video_info['owner']

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')

        # thumbnail image — missing thumbnail is only a warning, not fatal
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        # [elided: else:]
            video_thumbnail = video_info['thumbnail']

        # upload date
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            # Parse an RFC 2822 style date into YYYYMMDD.
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                upload_date = time.strftime('%Y%m%d', timetuple[0:9])

        # description
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)
        format_limit = self._downloader.params.get('format_limit', None)

        if format_limit is not None and format_limit in self._available_formats:
            format_list = self._available_formats[self._available_formats.index(format_limit):]
        # [elided: else:]
            format_list = self._available_formats
        existing_formats = [x for x in format_list if x in url_map]
        if len(existing_formats) == 0:
            self._downloader.trouble(u'ERROR: no known formats available for video')
        if req_format is None:
            video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
        elif req_format == 'worst':
            video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
        elif req_format == '-1':
            video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
        # [elided: else:]
            if req_format not in url_map:
                self._downloader.trouble(u'ERROR: requested format not available')
            video_url_list = [(req_format, url_map[req_format])] # Specific format

        # [elided: results list initialisation]
        for format_param, video_real_url in video_url_list:
            # Extension is taken from the per-format table, defaulting to mp4.
            video_extension = self._video_extensions.get(format_param, 'mp4')

            # [elided: 'results.append({' opening]
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension.decode('utf-8'),
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description.decode('utf-8'),
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    # NOTE(review): elided capture — some try:/guard/return lines are missing
    # between the statements below; '[elided]' comments mark obvious gaps.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Pulls the filename extension off the media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # [elided: 'if mobj is None:' guard]
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # [elided: choice of query separator 'cchar' based on '?' in url]
        # Ask blip.tv for JSON metadata of the same page.
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        self.report_extraction(mobj.group(1))
        # [elided: info initialisation and try:]
        urlh = compat_urllib_request.urlopen(request)
        if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
            basename = url.split('/')[-1]
            title,ext = os.path.splitext(basename)
            # NOTE(review): str has no .decode on Python 3 — verify the
            # intended bytes/str handling here.
            title = title.decode('UTF-8')
            ext = ext.replace('.', '')
            self.report_direct_download(title)
            # [elided: construction of the direct-download info dict]
            'upload_date': None,
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))

        if info is None: # Regular URL
            # [elided: try:]
            json_code_bytes = urlh.read()
            json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))

            # [elided: try:]
            json_data = json.loads(json_code)
            if 'Post' in json_data:
                data = json_data['Post']
            # [elided: else branch for the unwrapped layout]

            # 'datestamp' uses the '%m-%d-%y %H:%M%p' layout; reformatted to YYYYMMDD.
            upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
            video_url = data['media']['url']
            umobj = re.match(self._URL_EXT, video_url)
            # [elided: 'if umobj is None:' guard]
            raise ValueError('Can not determine filename extension')
            ext = umobj.group(1)

            # [elided: 'info = {' opening and url/ext entries]
                'id': data['item_id'],
                'uploader': data['display_name'],
                'upload_date': upload_date,
                'title': data['title'],
                'format': data['media']['mimeType'],
                'thumbnail': data['thumbnailUrl'],
                'description': data['description'],
                'player_url': data['embedUrl']
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))

        # blip.tv serves some media only to iTunes-like user agents.
        std_headers['User-Agent'] = 'iTunes/10.6.1'
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        """Extract the flv URL and title from a myvideo.de watch page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # FIX: was self._download.trouble(...) — the attribute is named
            # _downloader, so the invalid-URL path raised AttributeError.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        request = compat_urllib_request.Request('http://www.myvideo.de/watch/%s' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        # The thumbnail link embeds the movie path; the flv lives next to it.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # NOTE(review): elided capture — dict-literal bodies and several
    # try:/else:/return lines are missing; '[elided]' comments mark gaps.

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                     |(https?://)?(www\.)?
                         (?P<showname>thedailyshow|colbertnation)\.com/
                        (full-episodes/(?P<episode>.*)|
                          (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                          |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
    # [elided: closing quotes of the verbose pattern above]
    IE_NAME = u'comedycentral'

    # Bitrates offered by the CDN.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
    # [elided: bitrate -> extension entries]
    _video_dimensions = {
    # [elided: bitrate -> dimensions entries]

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written with re.VERBOSE.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

    def _print_formats(self, formats):
        print('Available formats:')
        # [elided: loop header over formats]
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # [elided: 'if mobj is None:' guard]
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        if mobj.group('shortname'):
            # Expand :tds / :colbert style abbreviations to full-episode pages.
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            # [elided: else:]
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            # [elided: else:]
                epTitle = mobj.group('cntitle')
        # [elided: else branch — episode handling]
            dlNewest = not mobj.group('episode')
            # [elided: if dlNewest:]
            epTitle = mobj.group('showname')
            # [elided: else:]
            epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        # [elided: try:]
        htmlHandle = compat_urllib_request.urlopen(req)
        html = htmlHandle.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
        # [elided: return and 'if dlNewest:' — follow the redirect to the
        #  newest episode and re-match the redirected URL]
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            # [elided: 'if mobj is None:' guard]
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', html)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', html)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
            # [elided: return / else:]
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        # [elided: try:]
        urlHandle = compat_urllib_request.urlopen(playerUrl_raw)
        playerUrl = urlHandle.geturl()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + compat_str(err))

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        # [elided: try:]
        indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))

        # [elided: results list initialisation]

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            # Media ids look like namespaced GUIDs; the last ':' component is
            # the short id, the one before it carries the show domain.
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle)
            # [elided: try:]
            configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            # [elided: turls list initialisation]
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                # [elided: turls.append(finfo)]

            # [elided: 'if len(turls) == 0:' guard]
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                # [elided: continue]

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                # [elided: return]

            # For now, just pick the highest bitrate
            format,video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            # [elided: loop over turls matching req_format]
                format, video_url = f, v

            # Patch to download from alternative CDN, which does not
            # break on current RTMPDump builds
            broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
            better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"

            if video_url.startswith(broken_cdn):
                video_url = video_url.replace(broken_cdn, better_cdn)

            effTitle = showId + u'-' + epTitle
            # [elided: 'info = {' opening with id/url/uploader entries]
                'upload_date': officialDate,
            # [elided: title/ext/format/thumbnail entries]
                'description': officialTitle,
                'player_url': None #playerUrl

            results.append(info)
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    # NOTE(review): elided capture — try:/guard/return lines are missing
    # between some statements below; '[elided]' comments mark the gaps.
    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # [elided: 'if mobj is None:' guard]
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        # [elided: try:]
        webPage = compat_urllib_request.urlopen(url)
        webPageBytes = webPage.read()
        # Honour the charset declared in the Content-Type response header.
        m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
        webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))

        # Scrape metadata out of the page's meta tags.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        # The player URL carries the config URL in its query string.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        # [elided: try:]
        configJSON = compat_urllib_request.urlopen(configUrl)
        m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
        configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        # [elided: try:]
        config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        # [elided: 'info = {' opening with id/url entries]
            'uploader': showName,
            'upload_date': None,
        # [elided: title/ext entries]
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        # [elided: closing brace and return]
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    # NOTE(review): elided capture — try:/guard/return lines are missing
    # between some statements below; '[elided]' comments mark the gaps.
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # [elided: 'if mobj is None:' guard]
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # [elided: 'info = {' opening with id entry]
            'upload_date': None,
        # [elided: closing brace]

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        # [elided: try:]
        metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        # [elided: try:]
        videoNode = mdoc.findall('./video')[0]
        info['description'] = videoNode.findall('./description')[0].text
        info['title'] = videoNode.findall('./caption')[0].text
        info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
        manifest_url = videoNode.findall('./file')[0].text
        # [elided: 'except IndexError:']
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            # [elided: return]

        # Request the Adobe HTTP Dynamic Streaming (f4m) manifest.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        # [elided: try:]
        manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        # [elided: try:]
        # The manifest elements live in the Adobe f4m XML namespace.
        media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
        node_id = media_node.attrib['url']
        video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')
            # [elided: return]

        # Compose the first-segment URL from the manifest location.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
        # [elided: remaining info fields and return]
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    # NOTE(review): elided capture — 'if mobj is None:'/try:/return lines are
    # missing between some statements below; '[elided]' comments mark gaps.
    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # [elided: 'if mobj is None:' guard]
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        request = compat_urllib_request.Request(r'http://www.xvideos.com/video' + video_id)
        # [elided: try:]
        webpage_bytes = compat_urllib_request.urlopen(request).read()
        webpage = webpage_bytes.decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        self.report_extraction(video_id)

        # Extract video URL (percent-encoded in the flv_url parameter)
        mobj = re.search(r'flv_url=(.+?)&', webpage)
        # [elided: 'if mobj is None:' guard]
        self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = compat_urllib_parse.unquote(mobj.group(1))

        # Extract title
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        # [elided: 'if mobj is None:' guard]
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1)

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        # [elided: 'if mobj is None:' guard]
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(0)

        # [elided: 'info = {' opening with id/url/uploader entries]
            'upload_date': None,
            'title': video_title,
        # [elided: ext entry]
            'thumbnail': video_thumbnail,
            'description': None,
        # [elided: closing brace and return]
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
    """

    # NOTE(review): elided capture — try:/guard/return lines are missing
    # between some statements below; '[elided]' comments mark the gaps.
    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # [elided: 'if mobj is None:' guard]
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # Resolve the page URL to an API track resource.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        # [elided: try:]
        info_json_bytes = compat_urllib_request.urlopen(request).read()
        info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        # [elided: try:]
        stream_json_bytes = compat_urllib_request.urlopen(request).read()
        stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        # [elided: 'return [{' opening with id/url entries]
            'uploader': info['user']['username'],
            # NOTE(review): 'created_at' is an API timestamp, not the YYYYMMDD
            # layout the class docs describe — confirm downstream handling.
            'upload_date': info['created_at'],
            'title': info['title'],
        # [elided: ext entry]
            'description': info['description'],
        # [elided: closing and return]
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""

    # NOTE(review): elided capture — 'if mobj is None:'/try:/return lines are
    # missing between some statements below; '[elided]' comments mark gaps.
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
    # [elided: IE_NAME assignment]

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # [elided: 'if mobj is None:' guard]
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        self.report_webpage(url)

        request = compat_urllib_request.Request(url)
        # [elided: try:]
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        self.report_extraction(url)

        # Extract video URL
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        # [elided: 'if mobj is None:' guard]
        self._downloader.trouble(u'ERROR: unable to extract video url')
        # NOTE(review): .decode('base64') is a Python 2 idiom (removed in
        # Python 3) — confirm which interpreter this file targets.
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + compat_urllib_parse.unquote(mobj.group(1).decode('base64'))

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        # [elided: 'if mobj is None:' guard]
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1).decode('utf-8')

        # Derive the id and extension from the media filename.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        # [elided: 'info = {' opening with id/url/uploader entries]
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'description': video_description,
        # [elided: closing and return]
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""
    # NOTE(review): this excerpt omits interstitial lines (try:, 'if ... is None:'
    # guards, break/return statements, and the final 'return [{' wrapper); the
    # orphaned 'except' and bare dict-entry lines below reflect those gaps.

    # Disabled pending port to the new Mixcloud API (see link); tests are skipped.
    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        # 'best' (or an unknown bitrate) falls back to the highest available one.
        bitrate_list = jsonData[fmt]
        if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
            bitrate = max(bitrate_list) # select highest

        url_list = jsonData[fmt][bitrate]
        # NOTE(review): matching 'try:' line missing from this excerpt.
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            # Probe each candidate URL; any network/HTTP failure moves to the next.
            # NOTE(review): 'try:' and the success-path 'return url' lines missing here.
            compat_urllib_request.urlopen(url)
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:

    def _print_formats(self, formats):
        # Print a "format<TAB>bitrate<TAB>[ext]" table for --list-formats.
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                # NOTE(review): 'try:' line missing from this excerpt.
                ext = formats[fmt][b][0]
                print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): 'if mobj is None:' guard line missing from this excerpt.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # extract uploader & filename from url
        # Python 2 only: bytes.decode on the regex group results.
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        self.report_download_json(file_url)
        jsonData = compat_urllib_request.urlopen(request).read()
        # NOTE(review): matching 'try:' line missing from this excerpt.
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)

        if req_format is None or req_format == 'best':
            # Scan formats for the first one with a live URL.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                file_url = self.check_urls(url_list)
                if file_url is not None:
            # NOTE(review): 'break' / 'else:' / 'return' lines missing from this excerpt.
            if req_format not in formats:
                self._downloader.trouble(u'ERROR: format is not available')
            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        # NOTE(review): the 'return [{' wrapper for these entries is missing here.
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            # Py2 'and/or' conditional idiom: u'NA' when no format was chosen.
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""
    # Handles three URL shapes via named groups: a specific video, a course page
    # (expanded into per-video references), and the site root (expanded into
    # per-course references). References are resolved recursively via self.extract.
    # NOTE(review): this excerpt omits interstitial lines (try:, guards, 'info = {'
    # wrappers, returns); orphaned 'except' and bare dict-entry lines reflect gaps.

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): 'if mobj is None:' guard line missing from this excerpt.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            # NOTE(review): the 'info = {' wrapper for these entries is missing here.
            'id': course + '_' + video,
            'upload_date': None,

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            # NOTE(review): matching 'try:' line missing from this excerpt.
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            # Pull title and file name out of the per-video metadata XML.
            info['title'] = mdoc.findall('./title')[0].text
            info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            # NOTE(review): this trouble() call belongs to a missing 'except IndexError:'.
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            # NOTE(review): 'info = {' wrapper and 'id' entry missing from this excerpt.
            'upload_date': None,

            self.report_download_webpage(info['id'])
            coursepage = compat_urllib_request.urlopen(url).read()
            # NOTE(review): matching 'try:' line missing from this excerpt.
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            # Title falls back to the id when the <h1> is absent
            # (NOTE(review): the 'if m:'/'else:' lines are missing from this excerpt).
            info['title'] = unescapeHTML(m.group(1))
            info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            info['description'] = unescapeHTML(m.group(1))

            # Collect unique VideoPage links in page order.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            # NOTE(review): the surrounding list-comprehension/dict lines are missing here.
            'type': 'reference',
            'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),

            # Recursively extract each referenced video.
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            # NOTE(review): else-branch ('info = {' wrapper) lines missing below.
            'id': 'Stanford OpenClassroom',
            'upload_date': None,

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            rootpage = compat_urllib_request.urlopen(rootURL).read()
            # NOTE(review): matching 'try:' line missing from this excerpt.
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            # Collect unique CoursePage links from the site root.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            # NOTE(review): the surrounding list-comprehension/dict lines are missing here.
            'type': 'reference',
            'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),

            # Recursively extract each referenced course.
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""
    # Scrapes <meta> tags from the video page, then fetches a mediaGen XML
    # document listing renditions and picks the last (highest-quality) one.
    # NOTE(review): this excerpt omits interstitial lines (try:, 'if mobj is None:'
    # guards, the final 'return [{' wrapper); orphan lines below reflect those gaps.

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): 'if mobj is None:' guard line missing from this excerpt.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        # Scheme-less URLs are accepted by _VALID_URL; normalize to http.
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')
        self.report_webpage(video_id)

        request = compat_urllib_request.Request(url)
        webpage = compat_urllib_request.urlopen(request).read()
        # NOTE(review): matching 'try:' line missing from this excerpt.
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract song name')
        # Python 2 only: bytes.decode; page metadata is latin-1 encoded.
        song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract performer')
        performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        # NOTE(review): message looks truncated ('unable to mtvn_uri' — presumably
        # meant 'unable to extract mtvn_uri'); left untouched here.
        self._downloader.trouble(u'ERROR: unable to mtvn_uri')
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        self._downloader.trouble(u'ERROR: unable to extract content id')
        content_id = mobj.group(1)

        # mediaGen endpoint returns the rendition list for this uri/id/vid triple.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        metadataXml = compat_urllib_request.urlopen(request).read()
        # NOTE(review): matching 'try:' line missing from this excerpt.
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        # Format label: "<ext>-<width>x<height>_<bitrate>" from rendition attributes.
        _,_,ext = rendition.attrib['type'].partition('/')
        format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
        video_url = rendition.find('./src').text
        # NOTE(review): this trouble() call belongs to a missing except clause.
        self._downloader.trouble('Invalid rendition field.')

        # NOTE(review): the 'return [{' wrapper for these entries is missing here.
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Fetches the getPlayList JSON, de-obfuscates the segment file ids with a
    seeded shuffle, and yields one info dict per flv segment.
    """
    # NOTE(review): this excerpt omits interstitial lines (try:, guards,
    # 'def _gen_sid' header, 'info = {' wrappers); orphan lines reflect gaps.

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)

    # NOTE(review): the 'def _gen_sid(self):' header line is missing from this
    # excerpt; the body below builds a session id from the current time in ms
    # plus two random components.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Shuffle the alphabet deterministically from *seed* (LCG-style)."""
        # NOTE(review): the 'mixed = []' initializer line appears missing here.
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        for i in range(len(source)):
            # Linear-congruential step; pick-and-remove yields a permutation.
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)

    def _get_file_id(self, fileId, seed):
        """Decode an obfuscated '*'-separated file id using the seeded table."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        # NOTE(review): the loop header over 'ids' ('for ch in ids:') is missing here.
        realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): 'if mobj is None:' guard line missing from this excerpt.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        self.report_download_webpage(video_id)
        jsondata = compat_urllib_request.urlopen(request).read()
        # NOTE(review): matching 'try:' line missing from this excerpt.
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        self.report_extraction(video_id)
        jsonstr = jsondata.decode('utf-8')
        config = json.loads(jsonstr)

        video_title = config['data'][0]['title']
        seed = config['data'][0]['seed']

        format = self._downloader.params.get('format', None)
        supported_format = list(config['data'][0]['streamfileids'].keys())

        # Map the requested format onto a Youku stream key; 'best' prefers hd2.
        if format is None or format == 'best':
            if 'hd2' in supported_format:
        # NOTE(review): branch bodies selecting the concrete format are missing here.
        elif format == 'worst':

        fileid = config['data'][0]['streamfileids'][format]
        keys = [s['k'] for s in config['data'][0]['segs'][format]]
        # NOTE(review): matching 'try:' line missing from this excerpt.
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.trouble(u'ERROR: unable to extract info section')

        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):
            # Splice the hex segment index into positions 8-9 of the file id.
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            # NOTE(review): the 'info = {' wrapper for these entries is missing here.
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'upload_date': None,
                'title': video_title,

            files_info.append(info)
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""
    # Pure regex scraping: URL, title and thumbnail each come from one pattern.
    # NOTE(review): this excerpt omits interstitial lines (try:, 'if ... is None:'
    # guards, the 'return [{' wrapper); orphan lines below reflect those gaps.

    _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
    VIDEO_URL_RE = r'flv_url=(.*?)&'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): 'if mobj is None:' guard line missing from this excerpt.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        # Get webpage content
        webpage_bytes = compat_urllib_request.urlopen(url).read()
        webpage = webpage_bytes.decode('utf-8')
        # NOTE(review): matching 'try:' line missing from this excerpt.
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)

        result = re.search(self.VIDEO_URL_RE, webpage)
        self._downloader.trouble(u'ERROR: unable to extract video url')
        # flv_url is percent-encoded in the page source.
        video_url = compat_urllib_parse.unquote(result.group(1))

        result = re.search(self.VIDEO_TITLE_RE, webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = result.group(1)

        result = re.search(self.VIDEO_THUMB_RE, webpage)
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = result.group(1)

        # NOTE(review): the 'return [{' wrapper for these entries is missing here.
            'upload_date': None,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'description': None,
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""
    # Two-step scrape: the post page yields date/uploader/title, then the photo
    # viewer page yields the redirector.googlevideo.com links per resolution.
    # NOTE(review): this excerpt omits interstitial lines (try:, 'if ... is None:'
    # guards, the final 'return [{' wrapper); orphan lines below reflect those gaps.

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading extry"""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report downloading extry"""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report downloading extry"""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report downloading extry"""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): 'if mobj is None:' guard line missing from this excerpt.
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        # NOTE(review): matching 'try:' line missing from this excerpt.
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))

        # Extract update date
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        upload_date = mobj.group(1)
        # Convert timestring to a format suitable for filename
        # (page shows ISO dates; output is the YYYYMMDD convention of this module)
        upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
        upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader from the rel="author" anchor.
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Get the first line for title
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        # NOTE(review): 'if mobj is None:' guard line missing from this excerpt.
        self._downloader.trouble(u'ERROR: unable to extract video page URL')

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        # NOTE(review): matching 'try:' line missing from this excerpt.
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        self.report_extract_vid_page(video_page)

        # Extract video links on video page
        """Extract video links of all sizes"""
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        # NOTE(review): the emptiness check on the findall result is missing here.
        self._downloader.trouble(u'ERROR: unable to extract video links')

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        # NOTE(review): matching 'try:' line missing from this excerpt.
        video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        # NOTE(review): the 'return [{' wrapper for these entries is missing here.
            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages.

    The downloadable mp4 URL is derived from the page path; metadata
    (title, date, description) is scraped from the page with regexes.
    """
    # NOTE(review): this excerpt omits interstitial lines (try:, guards,
    # return wrappers); orphan lines below reflect those gaps.

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'

    def report_extraction(self, video_id):
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): 'if mobj is None:' guard line missing from this excerpt.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        self.report_extraction(video_id)
        # NOTE(review): matching 'try:' line missing from this excerpt.
        urlh = compat_urllib_request.urlopen(url)
        webpage_bytes = urlh.read()
        webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))

        # CDN URL is constructed directly from the page path (no extraction needed).
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
        def _findProp(rexp, default=None):
            # Small helper: first regex group from the page, HTML-unescaped.
            m = re.search(rexp, webpage)
            # NOTE(review): the 'if m:'/'else: return default' lines are missing here.
            return unescapeHTML(m.group(1))

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        # NOTE(review): the 'return [{' wrapper for these entries is missing here.
            'id': shortened_video_id,
            # NOTE(review): 'uploader_date' looks like a typo for 'upload_date' —
            # verify against the fields the FileDownloader consumes.
            'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?
    # NOTE(review): this excerpt omits interstitial lines (try:, guards, loop
    # headers, 'info = {' wrappers); orphan lines below reflect those gaps.

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        # NOTE(review): matching 'try:' line missing from this excerpt.
        urlh = compat_urllib_request.urlopen(url)
        webpage_bytes = urlh.read()
        webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))

        response = json.loads(webpage)
        for clip in response:
            video_url = clip['video_file_url']
            # Extension from the URL path; date normalized to YYYYMMDD.
            video_extension = os.path.splitext(video_url)[1][1:]
            video_date = re.sub('-', '', clip['created_on'][:10])
            # NOTE(review): the 'info.append({' wrapper for these entries is missing here.
                'title': clip['title'],
                'uploader': clip.get('user_id', clip.get('channel_id')),
                'upload_date': video_date,
                'ext': video_extension,
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): 'if mobj is None:' guard line missing from this excerpt.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        api = 'http://api.justin.tv'
        # Group 2 (broadcast id) when present, else group 1 (channel name).
        video_id = mobj.group(mobj.lastindex)
        if mobj.lastindex == 1:
            # Channel URL: page through the archive listing.
            api += '/channel/archives/%s.json'
            # NOTE(review): the 'else:' separating the clip branch is missing here.
            api += '/clip/show/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        # NOTE(review): 'info = []' / 'offset = 0' / loop header lines missing here.
        limit = self._JUSTIN_PAGE_LIMIT
        self.report_download_page(video_id, offset)
        page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
        page_count, page_info = self._parse_page(page_url)
        info.extend(page_info)
        # A short page means the archive listing is exhausted.
        if not paged or page_count != limit:
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com video pages."""
    # Scrapes the HTML5 <video>/<source> tag for the file URL and og: metadata
    # for title/description.
    # NOTE(review): this excerpt omits interstitial lines (try:, 'if ... is None:'
    # guards, the final return wrapper); orphan lines below reflect those gaps.

    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
    IE_NAME = u'FunnyOrDie'

    def report_extraction(self, video_id):
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): 'if mobj is None:' guard line missing from this excerpt.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group('id')
        self.report_extraction(video_id)
        # NOTE(review): matching 'try:' line missing from this excerpt.
        urlh = compat_urllib_request.urlopen(url)
        webpage_bytes = urlh.read()
        webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # Second <source> inside the <video> tag carries the downloadable URL.
        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        self._downloader.trouble(u'ERROR: unable to find video information')
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
        self._downloader.trouble(u'Cannot find video title')
        title = unescapeHTML(m.group('title'))

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(m.group('desc'))
        # NOTE(review): the return wrapper and remaining entries are missing here.
            'description': desc,
class TweetReelIE(InfoExtractor):
    """Information extractor for tweetreel.com.

    The downloadable .mov URL is built from the page's status id; uploader
    and date are scraped from the surrounding HTML.
    """
    # NOTE(review): this excerpt omits interstitial lines (try:, 'if ... is None:'
    # guards, the return wrapper); orphan lines below reflect those gaps.

    _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'

    def report_extraction(self, video_id):
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): 'if mobj is None:' guard line missing from this excerpt.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group('id')
        self.report_extraction(video_id)
        # NOTE(review): matching 'try:' line missing from this excerpt.
        urlh = compat_urllib_request.urlopen(url)
        webpage_bytes = urlh.read()
        webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
        self._downloader.trouble(u'ERROR: Cannot find status ID')
        status_id = m.group(1)

        m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
        self._downloader.trouble(u'WARNING: Cannot find description')
        # Strip embedded <a> tags before unescaping the tweet text.
        desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()

        m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
        self._downloader.trouble(u'ERROR: Cannot find uploader')
        uploader = unescapeHTML(m.group('uploader'))
        uploader_id = unescapeHTML(m.group('uploader_id'))

        m = re.search(r'<span unixtime="([0-9]+)"', webpage)
        self._downloader.trouble(u'ERROR: Cannot find upload date')
        # Unix timestamp -> YYYYMMDD, this module's upload_date convention.
        upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')

        # Media URL is derived directly from the status id.
        video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'

        # NOTE(review): the return wrapper and remaining entries are missing here.
            'description': desc,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'internal_id': status_id,
            'upload_date': upload_date
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com video/app pages."""
    # Finds every 'movie_<id>' entry in the game's video page and pairs it with
    # the corresponding <span class="title"> by position.
    # NOTE(review): this excerpt omits interstitial lines — including the
    # '(?P<gameID>...)' line and closing quotes of _VALID_URL, try: lines, and
    # the enumerate loop header around the per-video lines at the bottom.

    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written with (?x)-style whitespace
        # and must be compiled with re.VERBOSE.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_download_video_page(self, game_id):
        self._downloader.to_screen(u'[%s] %s: Downloading video page' % (self.IE_NAME, game_id))

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        # Matches the per-movie JS object literals embedded in the page.
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        gameID = m.group('gameID')
        videourl = 'http://store.steampowered.com/video/%s/' % gameID
        self.report_download_video_page(gameID)
        # NOTE(review): matching 'try:' line missing from this excerpt.
        urlh = compat_urllib_request.urlopen(videourl)
        webpage_bytes = urlh.read()
        webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class=\"title\">(?P<videoName>[\w:/\.\?=\+\s-]+)</span>'
        titles = list(re.finditer(namesRE, webpage))
        # NOTE(review): the 'for i, vid in enumerate(mweb):' loop header is missing here.
        video_id = vid.group('videoID')
        title = titles[i].group('videoName')
        video_url=vid.group('videoURL')
        # NOTE(review): the 'if not video_url:' guard line is missing here.
        self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
3809 class UstreamIE(InfoExtractor):
3810 _VALID_URL = r'http://www.ustream.tv/recorded/(?P<videoID>\d+)'
3811 IE_NAME = u'ustream'
3813 def _real_extract(self, url):
3814 m = re.match(self._VALID_URL, url)
3815 video_id = m.group('videoID')
3816 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3818 urlh = compat_urllib_request.urlopen(url)
3819 webpage_bytes = urlh.read()
3820 webpage = webpage_bytes.decode('utf-8', 'ignore')
3821 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3822 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
3824 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3825 title = m.group('title')
3826 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3827 uploader = m.group('uploader')
3833 'uploader': uploader