2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # Author: Vasyl' Vavrychuk
7 # Author: Witold Baryluk
8 # License: Public domain code
32 # parse_qs was moved from the cgi module to the urlparse module recently.
34 from urlparse import parse_qs
36 from cgi import parse_qs
39 'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.12) Gecko/20101028 Firefox/3.6.12',
40 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
41 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
42 'Accept-Encoding': 'gzip, deflate',
43 'Accept-Language': 'en-us,en;q=0.5',
# Characters permitted in "simplified" titles: ASCII letters and digits as a
# unicode string (Python 2 str.decode; in Python 3 this call would not exist).
46 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
# NOTE(review): the baked-in line numbers below jump (48, 49, 51, 52, ...);
# this listing is sampled and several original lines are missing. Do not
# assume the visible statements are contiguous.
48 def preferredencoding():
49 """Get preferred encoding.
51 Returns the best encoding scheme for the system, based on
52 locale.getpreferredencoding() and some further tweaks.
# Inner generator that yields the chosen encoding; only its first value is
# consumed by the .next() call below.
54 def yield_preferredencoding():
56 pref = locale.getpreferredencoding()
# NOTE(review): the fallback branch (presumably defaulting to 'UTF-8' when
# the locale probe fails, per upstream youtube-dl) is not visible here —
# confirm against the full source.
62 return yield_preferredencoding().next()
# NOTE(review): sampled listing — lines are missing between the numbered
# statements (e.g. the `if mobj is not None:` guard and the numeric-base
# selection between lines 80 and 85 are not shown).
64 def htmlentity_transform(matchobj):
65 """Transforms an HTML entity to a Unicode character.
67 This function receives a match object and is intended to be used with
68 the re.sub() function.
70 entity = matchobj.group(1)
72 # Known non-numeric HTML entity
73 if entity in htmlentitydefs.name2codepoint:
74 return unichr(htmlentitydefs.name2codepoint[entity])
# Numeric entity: decimal (#123) or hexadecimal (#x7b).
77 mobj = re.match(ur'(?u)#(x?\d+)', entity)
79 numstr = mobj.group(1)
80 if numstr.startswith(u'x'):
# Prefix with '0' so long(numstr, 16) accepts the '0x...' form.
82 numstr = u'0%s' % numstr
# NOTE(review): `base` (16 for hex, 10 otherwise) is assigned on lines not
# visible in this listing.
85 return unichr(long(numstr, base))
87 # Unknown entity in name, return its literal representation
88 return (u'&%s;' % entity)
90 def sanitize_title(utitle):
91 """Sanitizes a video title so it could be used as part of a filename."""
92 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
93 return utitle.replace(unicode(os.sep), u'%')
# NOTE(review): sampled listing — the `try:` opening the except on line 113,
# the `if filename == u'-':` guard and the `import msvcrt` (lines 104-108)
# are among the lines not visible here.
95 def sanitize_open(filename, open_mode):
96 """Try to open the given filename, and slightly tweak it if this fails.
98 Attempts to open the given filename. If this fails, it tries to change
99 the filename slightly, step by step, until it's either able to open it
100 or it fails and raises a final exception, like the standard open()
103 It returns the tuple (stream, definitive_file_name).
# '-' means stdout; on Windows the stream must be switched to binary mode.
107 if sys.platform == 'win32':
109 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
110 return (sys.stdout, filename)
111 stream = open(filename, open_mode)
112 return (stream, filename)
113 except (IOError, OSError), err:
114 # In case of error, try to remove win32 forbidden chars
115 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
117 # An exception here should be caught in the caller
118 stream = open(filename, open_mode)
119 return (stream, filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp.

    Returns the epoch timestamp (a number) on success, or None when
    *timestr* cannot be parsed as an RFC 2822 date.
    """
    # Restored from the truncated listing: initialize the result so the
    # function returns None instead of raising NameError on parse failure.
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
    # Restored from the truncated listing: the docstring terminator and the
    # (empty) class body were missing.
    pass
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    # Restored from the truncated listing: the docstring terminator and the
    # (empty) class body were missing.
    pass
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    # Restored from the truncated listing: the docstring terminator and the
    # (empty) class body were missing.
    pass
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    # Restored from the truncated listing: the docstring terminator and the
    # (empty) class body were missing.
    pass
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both attributes are byte counts. Restored from the truncated listing:
    # the docstring terminator and these class-level defaults were missing.
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded = downloaded
        self.expected = expected
# NOTE(review): sampled listing — missing lines include the `deflate(data)`
# method header with its @staticmethod decorator (before line 198), the
# try/except that falls back between the two zlib.decompress calls, the
# `old_resp = resp` binding used in http_response, and the return statements
# of http_request/http_response.
177 class YoutubeDLHandler(urllib2.HTTPHandler):
178 """Handler for HTTP requests and responses.
180 This class, when installed with an OpenerDirector, automatically adds
181 the standard headers to every HTTP request and handles gzipped and
182 deflated responses from web servers. If compression is to be avoided in
183 a particular request, the original request in the program code only has
184 to include the HTTP header "Youtubedl-No-Compression", which will be
185 removed before making the real request.
187 Part of this code was copied from:
189 http://techknack.net/python-urllib2-handlers/
191 Andrew Rowls, the author of that code, agreed to release it to the
# Raw-deflate first (negative wbits = no zlib header), then plain zlib.
198 return zlib.decompress(data, -zlib.MAX_WBITS)
200 return zlib.decompress(data)
# Compatibility shim: older urllib2.addinfourl lacked the `code` argument.
203 def addinfourl_wrapper(stream, headers, url, code):
204 if hasattr(urllib2.addinfourl, 'getcode'):
205 return urllib2.addinfourl(stream, headers, url, code)
206 ret = urllib2.addinfourl(stream, headers, url)
# Add the standard headers unless the caller set them, and strip the
# internal no-compression marker before the request goes on the wire.
210 def http_request(self, req):
211 for h in std_headers:
214 req.add_header(h, std_headers[h])
215 if 'Youtubedl-no-compression' in req.headers:
216 if 'Accept-encoding' in req.headers:
217 del req.headers['Accept-encoding']
218 del req.headers['Youtubedl-no-compression']
# Transparently decompress gzip/deflate response bodies, preserving the
# original status message on the rewrapped response.
221 def http_response(self, req, resp):
224 if resp.headers.get('Content-encoding', '') == 'gzip':
225 gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
226 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
227 resp.msg = old_resp.msg
229 if resp.headers.get('Content-encoding', '') == 'deflate':
230 gz = StringIO.StringIO(self.deflate(resp.read()))
231 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
232 resp.msg = old_resp.msg
# NOTE(review): this listing is sampled — the baked-in original line numbers
# jump (e.g. 286 -> 292, 312 -> 316), so the statements below are NOT
# contiguous. Missing lines include @staticmethod decorators, try:/return
# statements, loop headers and local initializations; comments below flag
# the gaps that matter for reading.
235 class FileDownloader(object):
236 """File Downloader class.
238 File downloader objects are the ones responsible of downloading the
239 actual video file and writing it to disk if the user has requested
240 it, among some other tasks. In most cases there should be one per
241 program. As, given a video URL, the downloader doesn't know how to
242 extract all the needed information, task that InfoExtractors do, it
243 has to pass the URL to one of them.
245 For this, file downloader objects have a method that allows
246 InfoExtractors to be registered in a given order. When it is passed
247 a URL, the file downloader handles it to the first InfoExtractor it
248 finds that reports being able to handle it. The InfoExtractor extracts
249 all the information about the video or videos the URL refers to, and
250 asks the FileDownloader to process the video information, possibly
251 downloading the video.
253 File downloaders accept a lot of parameters. In order not to saturate
254 the object constructor with arguments, it receives a dictionary of
255 options instead. These options are available through the params
256 attribute for the InfoExtractors to use. The FileDownloader also
257 registers itself as the downloader in charge for the InfoExtractors
258 that are added to it, so this is a "mutual registration".
262 username: Username for authentication purposes.
263 password: Password for authentication purposes.
264 usenetrc: Use netrc for authentication instead.
265 quiet: Do not print messages to stdout.
266 forceurl: Force printing final URL.
267 forcetitle: Force printing title.
268 forcethumbnail: Force printing thumbnail URL.
269 forcedescription: Force printing description.
270 forcefilename: Force printing final filename.
271 simulate: Do not download the video files.
272 format: Video format code.
273 format_limit: Highest quality format to try.
274 outtmpl: Template for output names.
275 ignoreerrors: Do not stop on download errors.
276 ratelimit: Download speed limit, in bytes/sec.
277 nooverwrites: Prevent overwriting files.
278 retries: Number of times to retry for HTTP error 5xx
279 continuedl: Try to continue downloads if possible.
280 noprogress: Do not print the progress bar.
281 playliststart: Playlist item to start at.
282 playlistend: Playlist item to end at.
283 logtostderr: Log messages to stderr instead of stdout.
284 consoletitle: Display progress in console window's titlebar.
285 nopart: Do not use temporary .part files.
286 updatetime: Use the Last-modified header to set output file timestamps.
# Class-level defaults; the per-instance values are set in __init__.
292 _download_retcode = None
293 _num_downloads = None
296 def __init__(self, params):
297 """Create a FileDownloader object with the given options."""
# NOTE(review): lines 298-299 (presumably the _ies/_pps list and params
# assignment) are not visible in this listing.
300 self._download_retcode = 0
301 self._num_downloads = 0
# Index a two-element list with a bool: False -> stdout, True -> stderr.
302 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
# The @staticmethod decorator (original line 305) is not shown here.
306 def pmkdir(filename):
307 """Create directory components in filename. Similar to Unix "mkdir -p"."""
308 components = filename.split(os.sep)
309 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
310 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
311 for dir in aggregate:
312 if not os.path.exists(dir):
# Format a byte count as a human-readable string, e.g. '1.23M'; the guard
# branches for str input and zero bytes are only partially visible here.
316 def format_bytes(bytes):
319 if type(bytes) is str:
324 exponent = long(math.log(bytes, 1024.0))
325 suffix = 'bkMGTPEZY'[exponent]
326 converted = float(bytes) / float(1024**exponent)
327 return '%.2f%s' % (converted, suffix)
# Right-aligned percentage string; the unknown-length branch is not shown.
330 def calc_percent(byte_counter, data_len):
333 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
# ETA string 'MM:SS'; the `dif` computation line is not visible here.
336 def calc_eta(start, now, total, current):
340 if current == 0 or dif < 0.001: # One millisecond
342 rate = float(current) / dif
343 eta = long((float(total) - float(current)) / rate)
344 (eta_mins, eta_secs) = divmod(eta, 60)
347 return '%02d:%02d' % (eta_mins, eta_secs)
# Average speed string; delegates the unit formatting to format_bytes.
350 def calc_speed(start, now, bytes):
352 if bytes == 0 or dif < 0.001: # One millisecond
353 return '%10s' % '---b/s'
354 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
# Choose the next read size from observed throughput, clamped between half
# the previous block and 4 MiB.
357 def best_block_size(elapsed_time, bytes):
358 new_min = max(bytes / 2.0, 1.0)
359 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
360 if elapsed_time < 0.001:
362 rate = bytes / elapsed_time
370 def parse_bytes(bytestr):
371 """Parse a string indicating a byte quantity into a long integer."""
372 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
# An empty suffix group matches 'b' via .lower() indexing -> multiplier 1.
375 number = float(matchobj.group(1))
376 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
377 return long(round(number * multiplier))
# Mutual registration: the downloader keeps the IE and the IE gets a back-
# reference to this downloader.
379 def add_info_extractor(self, ie):
380 """Add an InfoExtractor object to the end of the list."""
382 ie.set_downloader(self)
384 def add_post_processor(self, pp):
385 """Add a PostProcessor object to the end of the chain."""
387 pp.set_downloader(self)
# All console output funnels through here so 'quiet' mode and encoding
# errors are handled in a single place. NOTE(review): the `try:` matching
# the except on line 396 is not visible in this listing.
389 def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
390 """Print message to stdout if not in quiet mode."""
392 if not self.params.get('quiet', False):
393 terminator = [u'\n', u''][skip_eol]
394 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
395 self._screen_file.flush()
396 except (UnicodeEncodeError), err:
397 if not ignore_encoding_errors:
400 def to_stderr(self, message):
401 """Print message to stderr."""
402 print >>sys.stderr, message.encode(preferredencoding())
404 def to_cons_title(self, message):
405 """Set console/terminal window title to message."""
406 if not self.params.get('consoletitle', False):
408 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
409 # c_wchar_p() might not be necessary if `message` is
410 # already of type unicode()
411 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
412 elif 'TERM' in os.environ:
413 sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
# "Fixed" = the output template contains no %(field)s placeholders, so
# every download would map to the same file.
415 def fixed_template(self):
416 """Checks if the output template is fixed."""
417 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
419 def trouble(self, message=None):
420 """Determine action to take when a download problem appears.
422 Depending on if the downloader has been configured to ignore
423 download errors or not, this method may throw an exception or
424 not when errors are found, after printing the message.
426 if message is not None:
427 self.to_stderr(message)
428 if not self.params.get('ignoreerrors', False):
429 raise DownloadError(message)
430 self._download_retcode = 1
432 def slow_down(self, start_time, byte_counter):
433 """Sleep if the download speed is over the rate limit."""
434 rate_limit = self.params.get('ratelimit', None)
435 if rate_limit is None or byte_counter == 0:
438 elapsed = now - start_time
441 speed = float(byte_counter) / elapsed
442 if speed > rate_limit:
# Sleep just long enough that the average speed drops back to the limit.
443 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
# '.part' temporary-file naming helpers. '-' (stdout) and non-regular
# existing paths never get a .part suffix.
445 def temp_name(self, filename):
446 """Returns a temporary filename for the given filename."""
447 if self.params.get('nopart', False) or filename == u'-' or \
448 (os.path.exists(filename) and not os.path.isfile(filename)):
450 return filename + u'.part'
452 def undo_temp_name(self, filename):
453 if filename.endswith(u'.part'):
454 return filename[:-len(u'.part')]
# NOTE(review): the `try:` matching the except on line 462 is not visible.
457 def try_rename(self, old_filename, new_filename):
459 if old_filename == new_filename:
461 os.rename(old_filename, new_filename)
462 except (IOError, OSError), err:
463 self.trouble(u'ERROR: unable to rename file')
465 def try_utime(self, filename, last_modified_hdr):
466 """Try to set the last-modified time of the given file."""
467 if last_modified_hdr is None:
469 if not os.path.isfile(filename):
471 timestr = last_modified_hdr
474 filetime = timeconvert(timestr)
# Keep atime = now, set mtime = server-reported Last-modified.
478 os.utime(filename,(time.time(), filetime))
# --- user-facing progress/report helpers ---
482 def report_destination(self, filename):
483 """Report destination filename."""
484 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
486 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
487 """Report download progress."""
488 if self.params.get('noprogress', False):
490 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
491 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
492 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
493 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
495 def report_resuming_byte(self, resume_len):
496 """Report attempt to resume at given byte."""
497 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
499 def report_retry(self, count, retries):
500 """Report retry in case of HTTP error 5xx"""
501 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
# NOTE(review): the `try:` matching the except on line 507 is not visible.
503 def report_file_already_downloaded(self, file_name):
504 """Report file has already been fully downloaded."""
506 self.to_screen(u'[download] %s has already been downloaded' % file_name)
507 except (UnicodeEncodeError), err:
508 self.to_screen(u'[download] The file has already been downloaded')
510 def report_unable_to_resume(self):
511 """Report it was impossible to resume download."""
512 self.to_screen(u'[download] Unable to resume')
514 def report_finish(self):
515 """Report download finished."""
516 if self.params.get('noprogress', False):
517 self.to_screen(u'[download] Download completed')
521 def increment_downloads(self):
522 """Increment the ordinal that assigns a number to each file."""
523 self._num_downloads += 1
# Expand the outtmpl template; NOTE(review): the `try:` matching the except
# on line 533 and the success-path return are not visible here.
525 def prepare_filename(self, info_dict):
526 """Generate the output filename."""
528 template_dict = dict(info_dict)
529 template_dict['epoch'] = unicode(long(time.time()))
530 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
531 filename = self.params['outtmpl'] % template_dict
533 except (ValueError, KeyError), err:
534 self.trouble(u'ERROR: invalid system charset or erroneous output template')
# Main per-video entry point, called with one InfoExtractor result dict.
# NOTE(review): several try:/return lines are missing between the numbered
# statements below (e.g. around lines 553-557, 560-562, 566-568).
537 def process_info(self, info_dict):
538 """Process a single dictionary returned by an InfoExtractor."""
539 filename = self.prepare_filename(info_dict)
540 # Do nothing else if in simulate mode
541 if self.params.get('simulate', False):
543 if self.params.get('forcetitle', False):
544 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
545 if self.params.get('forceurl', False):
546 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
547 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
548 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
549 if self.params.get('forcedescription', False) and 'description' in info_dict:
550 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
551 if self.params.get('forcefilename', False) and filename is not None:
552 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
558 if self.params.get('nooverwrites', False) and os.path.exists(filename):
559 self.to_stderr(u'WARNING: file exists and will be skipped')
563 self.pmkdir(filename)
564 except (OSError, IOError), err:
565 self.trouble(u'ERROR: unable to create directories: %s' % str(err))
569 success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
570 except (OSError, IOError), err:
571 raise UnavailableVideoError
572 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
573 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
575 except (ContentTooShortError, ), err:
576 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
581 self.post_process(filename, info_dict)
582 except (PostProcessingError), err:
583 self.trouble(u'ERROR: postprocessing: %s' % str(err))
# NOTE(review): the `for url in url_list:` / `for ie in self._ies:` loop
# headers are among the lines missing below.
586 def download(self, url_list):
587 """Download a given list of URLs."""
588 if len(url_list) > 1 and self.fixed_template():
589 raise SameFileError(self.params['outtmpl'])
592 suitable_found = False
594 # Go to next InfoExtractor if not suitable
595 if not ie.suitable(url):
598 # Suitable InfoExtractor found
599 suitable_found = True
601 # Extract information from URL and process it
604 # Suitable InfoExtractor had been found; go to next URL
607 if not suitable_found:
608 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
610 return self._download_retcode
# NOTE(review): the `info = dict(ie_info)` copy and the loop over self._pps
# are not visible here.
612 def post_process(self, filename, ie_info):
613 """Run the postprocessing chain on the given file."""
615 info['filepath'] = filename
# RTMP download path delegated to the external rtmpdump binary.
621 def _download_with_rtmpdump(self, filename, url, player_url):
622 self.report_destination(filename)
623 tmpfilename = self.temp_name(filename)
625 # Check for rtmpdump first
627 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
628 except (OSError, IOError):
629 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
632 # Download using rtmpdump. rtmpdump returns exit code 2 when
633 # the connection was interrumpted and resuming appears to be
634 # possible. This is part of rtmpdump's normal usage, AFAIK.
635 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
636 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
637 while retval == 2 or retval == 1:
638 prevsize = os.path.getsize(tmpfilename)
639 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
640 time.sleep(5.0) # This seems to be needed
641 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
642 cursize = os.path.getsize(tmpfilename)
643 if prevsize == cursize and retval == 1:
646 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
647 self.try_rename(tmpfilename, filename)
650 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
# Plain-HTTP download with resume (Range header) and 5xx retry support.
# NOTE(review): missing lines include the `count = 0` initialization, the
# `open_mode` selection, the before/after timestamps around the read, and
# several try:/break/return statements.
653 def _do_download(self, filename, url, player_url):
654 # Check file already present
655 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
656 self.report_file_already_downloaded(filename)
659 # Attempt to download using rtmpdump
660 if url.startswith('rtmp'):
661 return self._download_with_rtmpdump(filename, url, player_url)
663 tmpfilename = self.temp_name(filename)
667 # Do not include the Accept-Encoding header
668 headers = {'Youtubedl-no-compression': 'True'}
669 basic_request = urllib2.Request(url, None, headers)
670 request = urllib2.Request(url, None, headers)
672 # Establish possible resume length
673 if os.path.isfile(tmpfilename):
674 resume_len = os.path.getsize(tmpfilename)
678 # Request parameters in case of being able to resume
679 if self.params.get('continuedl', False) and resume_len != 0:
680 self.report_resuming_byte(resume_len)
681 request.add_header('Range','bytes=%d-' % resume_len)
685 retries = self.params.get('retries', 0)
686 while count <= retries:
687 # Establish connection
689 data = urllib2.urlopen(request)
691 except (urllib2.HTTPError, ), err:
692 if (err.code < 500 or err.code >= 600) and err.code != 416:
693 # Unexpected HTTP error
695 elif err.code == 416:
696 # Unable to resume (requested range not satisfiable)
698 # Open the connection again without the range header
699 data = urllib2.urlopen(basic_request)
700 content_length = data.info()['Content-Length']
701 except (urllib2.HTTPError, ), err:
702 if err.code < 500 or err.code >= 600:
705 # Examine the reported length
706 if (content_length is not None and
707 (resume_len - 100 < long(content_length) < resume_len + 100)):
708 # The file had already been fully downloaded.
709 # Explanation to the above condition: in issue #175 it was revealed that
710 # YouTube sometimes adds or removes a few bytes from the end of the file,
711 # changing the file size slightly and causing problems for some users. So
712 # I decided to implement a suggested change and consider the file
713 # completely downloaded if the file size differs less than 100 bytes from
714 # the one in the hard drive.
715 self.report_file_already_downloaded(filename)
716 self.try_rename(tmpfilename, filename)
719 # The length does not match, we start the download over
720 self.report_unable_to_resume()
726 self.report_retry(count, retries)
729 self.trouble(u'ERROR: giving up after %s retries' % retries)
732 data_len = data.info().get('Content-length', None)
733 if data_len is not None:
734 data_len = long(data_len) + resume_len
735 data_len_str = self.format_bytes(data_len)
736 byte_counter = 0 + resume_len
742 data_block = data.read(block_size)
744 if len(data_block) == 0:
746 byte_counter += len(data_block)
748 # Open file just in time
751 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
752 filename = self.undo_temp_name(tmpfilename)
753 self.report_destination(filename)
754 except (OSError, IOError), err:
755 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
758 stream.write(data_block)
759 except (IOError, OSError), err:
760 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
762 block_size = self.best_block_size(after - before, len(data_block))
765 percent_str = self.calc_percent(byte_counter, data_len)
766 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
767 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
768 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
771 self.slow_down(start, byte_counter - resume_len)
775 if data_len is not None and byte_counter != data_len:
776 raise ContentTooShortError(byte_counter, long(data_len))
777 self.try_rename(tmpfilename, filename)
779 # Update file modification time
780 if self.params.get('updatetime', True):
781 self.try_utime(filename, data.info().get('last-modified', None))
# NOTE(review): sampled listing — missing lines include the `suitable(url)`
# method header with its @staticmethod decorator (only its docstring on line
# 830 is visible), the `_ready` flag handling in __init__/initialize, and
# the `pass` bodies of the _real_* stubs.
785 class InfoExtractor(object):
786 """Information Extractor class.
788 Information extractors are the classes that, given a URL, extract
789 information from the video (or videos) the URL refers to. This
790 information includes the real video URL, the video title and simplified
791 title, author and others. The information is stored in a dictionary
792 which is then passed to the FileDownloader. The FileDownloader
793 processes this information possibly downloading the video to the file
794 system, among other possible outcomes. The dictionaries must include
795 the following fields:
797 id: Video identifier.
798 url: Final video URL.
799 uploader: Nickname of the video uploader.
800 title: Literal title.
801 stitle: Simplified title.
802 ext: Video filename extension.
803 format: Video format.
804 player_url: SWF Player URL (may be None).
806 The following fields are optional. Their primary purpose is to allow
807 youtube-dl to serve as the backend for a video search function, such
808 as the one in youtube2mp3. They are only used when their respective
809 forced printing functions are called:
811 thumbnail: Full URL to a video thumbnail image.
812 description: One-line video description.
814 Subclasses of this one should re-define the _real_initialize() and
815 _real_extract() methods, as well as the suitable() static method.
816 Probably, they should also be instantiated and added to the main
823 def __init__(self, downloader=None):
824 """Constructor. Receives an optional downloader."""
826 self.set_downloader(downloader)
# Static predicate (header not visible here): can this IE handle the URL?
830 """Receives a URL and returns True if suitable for this IE."""
833 def initialize(self):
834 """Initializes an instance (authentication, etc)."""
836 self._real_initialize()
839 def extract(self, url):
840 """Extracts URL information and returns it in list of dicts."""
842 return self._real_extract(url)
# Back-reference half of the "mutual registration" with FileDownloader.
844 def set_downloader(self, downloader):
845 """Sets the downloader for this IE."""
846 self._downloader = downloader
# Template-method hooks for subclasses.
848 def _real_initialize(self):
849 """Real initialization process. Redefine in subclasses."""
852 def _real_extract(self, url):
853 """Real extraction process. Redefine in subclasses."""
856 class YoutubeIE(InfoExtractor):
857 """Information extractor for youtube.com."""
859 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
860 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
861 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
862 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
863 _NETRC_MACHINE = 'youtube'
864 # Listed in order of quality
865 _available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
866 _video_extensions = {
872 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
879 return (re.match(YoutubeIE._VALID_URL, url) is not None)
881 def report_lang(self):
882 """Report attempt to set language."""
883 self._downloader.to_screen(u'[youtube] Setting language')
885 def report_login(self):
886 """Report attempt to log in."""
887 self._downloader.to_screen(u'[youtube] Logging in')
889 def report_age_confirmation(self):
890 """Report attempt to confirm age."""
891 self._downloader.to_screen(u'[youtube] Confirming age')
893 def report_video_webpage_download(self, video_id):
894 """Report attempt to download video webpage."""
895 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
897 def report_video_info_webpage_download(self, video_id):
898 """Report attempt to download video info webpage."""
899 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
901 def report_information_extraction(self, video_id):
902 """Report attempt to extract video information."""
903 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
905 def report_unavailable_format(self, video_id, format):
906 """Report extracted video URL."""
907 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
909 def report_rtmp_download(self):
910 """Indicate the download will use the RTMP protocol."""
911 self._downloader.to_screen(u'[youtube] RTMP download detected')
913 def _real_initialize(self):
914 if self._downloader is None:
919 downloader_params = self._downloader.params
921 # Attempt to use provided username and password or .netrc data
922 if downloader_params.get('username', None) is not None:
923 username = downloader_params['username']
924 password = downloader_params['password']
925 elif downloader_params.get('usenetrc', False):
927 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
932 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
933 except (IOError, netrc.NetrcParseError), err:
934 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
938 request = urllib2.Request(self._LANG_URL)
941 urllib2.urlopen(request).read()
942 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
943 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
946 # No authentication to be performed
952 'current_form': 'loginForm',
954 'action_login': 'Log In',
955 'username': username,
956 'password': password,
958 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
961 login_results = urllib2.urlopen(request).read()
962 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
963 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
965 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
966 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
972 'action_confirm': 'Confirm',
974 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
976 self.report_age_confirmation()
977 age_results = urllib2.urlopen(request).read()
978 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
979 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
982 def _real_extract(self, url):
983 # Extract video id from URL
984 mobj = re.match(self._VALID_URL, url)
986 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
988 video_id = mobj.group(2)
991 self.report_video_webpage_download(video_id)
992 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
994 video_webpage = urllib2.urlopen(request).read()
995 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
996 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
999 # Attempt to extract SWF player URL
1000 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1001 if mobj is not None:
1002 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1007 self.report_video_info_webpage_download(video_id)
1008 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1009 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1010 % (video_id, el_type))
1011 request = urllib2.Request(video_info_url)
1013 video_info_webpage = urllib2.urlopen(request).read()
1014 video_info = parse_qs(video_info_webpage)
1015 if 'token' in video_info:
1017 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1018 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1020 if 'token' not in video_info:
1021 if 'reason' in video_info:
1022 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1024 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1027 # Start extracting information
1028 self.report_information_extraction(video_id)
1031 if 'author' not in video_info:
1032 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1034 video_uploader = urllib.unquote_plus(video_info['author'][0])
1037 if 'title' not in video_info:
1038 self._downloader.trouble(u'ERROR: unable to extract video title')
1040 video_title = urllib.unquote_plus(video_info['title'][0])
1041 video_title = video_title.decode('utf-8')
1042 video_title = sanitize_title(video_title)
1045 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1046 simple_title = simple_title.strip(ur'_')
1049 if 'thumbnail_url' not in video_info:
1050 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1051 video_thumbnail = ''
1052 else: # don't panic if we can't find it
1053 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1057 mobj = re.search(r'id="eow-date".*?>(.*?)</span>', video_webpage, re.DOTALL)
1058 if mobj is not None:
1059 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1060 format_expressions = ['%d %B %Y', '%B %d %Y']
1061 for expression in format_expressions:
1063 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1068 video_description = 'No description available.'
1069 if self._downloader.params.get('forcedescription', False):
1070 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1071 if mobj is not None:
1072 video_description = mobj.group(1)
1075 video_token = urllib.unquote_plus(video_info['token'][0])
1077 # Decide which formats to download
1078 req_format = self._downloader.params.get('format', None)
1080 if 'fmt_url_map' in video_info:
1081 url_map = dict(tuple(pair.split('|')) for pair in video_info['fmt_url_map'][0].split(','))
1082 format_limit = self._downloader.params.get('format_limit', None)
1083 if format_limit is not None and format_limit in self._available_formats:
1084 format_list = self._available_formats[self._available_formats.index(format_limit):]
1086 format_list = self._available_formats
1087 existing_formats = [x for x in format_list if x in url_map]
1088 if len(existing_formats) == 0:
1089 self._downloader.trouble(u'ERROR: no known formats available for video')
1091 if req_format is None:
1092 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1093 elif req_format == '-1':
1094 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1097 if req_format not in url_map:
1098 self._downloader.trouble(u'ERROR: requested format not available')
1100 video_url_list = [(req_format, url_map[req_format])] # Specific format
1102 elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1103 self.report_rtmp_download()
1104 video_url_list = [(None, video_info['conn'][0])]
1107 self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
1110 for format_param, video_real_url in video_url_list:
1111 # At this point we have a new video
1112 self._downloader.increment_downloads()
1115 video_extension = self._video_extensions.get(format_param, 'flv')
1117 # Find the video URL in fmt_url_map or conn paramters
1119 # Process video information
1120 self._downloader.process_info({
1121 'id': video_id.decode('utf-8'),
1122 'url': video_real_url.decode('utf-8'),
1123 'uploader': video_uploader.decode('utf-8'),
1124 'upload_date': upload_date,
1125 'title': video_title,
1126 'stitle': simple_title,
1127 'ext': video_extension.decode('utf-8'),
1128 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1129 'thumbnail': video_thumbnail.decode('utf-8'),
1130 'description': video_description.decode('utf-8'),
1131 'player_url': player_url,
1133 except UnavailableVideoError, err:
1134 self._downloader.trouble(u'\nERROR: unable to download video')
1137 class MetacafeIE(InfoExtractor):
1138 """Information Extractor for metacafe.com."""
# NOTE(review): gap-ridden numbered listing — guard lines ('try:', 'if mobj is None:',
# 'return') are omitted between visible statements; do not infer control flow from
# adjacency. Python 2 idioms (urllib2, str/unicode .decode) throughout.
1140 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1141 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1142 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
# Keeps a YoutubeIE instance so 'yt-<id>' Metacafe entries can be delegated to it.
1145 def __init__(self, youtube_ie, downloader=None):
1146 InfoExtractor.__init__(self, downloader)
1147 self._youtube_ie = youtube_ie
1151 return (re.match(MetacafeIE._VALID_URL, url) is not None)
1153 def report_disclaimer(self):
1154 """Report disclaimer retrieval."""
1155 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1157 def report_age_confirmation(self):
1158 """Report attempt to confirm age."""
1159 self._downloader.to_screen(u'[metacafe] Confirming age')
1161 def report_download_webpage(self, video_id):
1162 """Report webpage download."""
1163 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1165 def report_extraction(self, video_id):
1166 """Report information extraction."""
1167 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
# _real_initialize: fetch the family-filter disclaimer page, then POST the
# age-confirmation form so subsequent watch-page fetches are unfiltered.
1169 def _real_initialize(self):
1170 # Retrieve disclaimer
1171 request = urllib2.Request(self._DISCLAIMER)
1173 self.report_disclaimer()
1174 disclaimer = urllib2.urlopen(request).read()
1175 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1176 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
# The rest of the disclaimer_form dict literal is omitted by the listing gaps.
1182 'submit': "Continue - I'm over 18",
1184 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1186 self.report_age_confirmation()
1187 disclaimer = urllib2.urlopen(request).read()
1188 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1189 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
# _real_extract: delegates 'yt-<id>' videos to the YouTube extractor; otherwise
# scrapes the watch page for mediaURL (plus optional gdaKey), falling back to the
# flashvars 'mediaData' blob when no direct mediaURL is present.
1192 def _real_extract(self, url):
1193 # Extract id and simplified title from URL
1194 mobj = re.match(self._VALID_URL, url)
1196 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1199 video_id = mobj.group(1)
1201 # Check if video comes from YouTube
1202 mobj2 = re.match(r'^yt-(.*)$', video_id)
1203 if mobj2 is not None:
1204 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1207 # At this point we have a new video
1208 self._downloader.increment_downloads()
# Second URL path component doubles as the human-readable "simple title".
1210 simple_title = mobj.group(2).decode('utf-8')
1212 # Retrieve video webpage to extract further information
1213 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1215 self.report_download_webpage(video_id)
1216 webpage = urllib2.urlopen(request).read()
1217 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1218 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1221 # Extract URL, uploader and title from webpage
1222 self.report_extraction(video_id)
1223 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1224 if mobj is not None:
1225 mediaURL = urllib.unquote(mobj.group(1))
# Extension is guessed from the URL's last three characters.
1226 video_extension = mediaURL[-3:]
1228 # Extract gdaKey if available
1229 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1231 video_url = mediaURL
1233 gdaKey = mobj.group(1)
1234 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Fallback path: pull the media URL/key pair out of the flashvars query string.
1236 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1238 self._downloader.trouble(u'ERROR: unable to extract media URL')
1240 vardict = parse_qs(mobj.group(1))
1241 if 'mediaData' not in vardict:
1242 self._downloader.trouble(u'ERROR: unable to extract media URL')
1244 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1246 self._downloader.trouble(u'ERROR: unable to extract media URL')
1248 mediaURL = mobj.group(1).replace('\\/', '/')
1249 video_extension = mediaURL[-3:]
1250 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1252 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1254 self._downloader.trouble(u'ERROR: unable to extract title')
1256 video_title = mobj.group(1).decode('utf-8')
1257 video_title = sanitize_title(video_title)
1259 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1261 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1263 video_uploader = mobj.group(1)
1266 # Process video information
1267 self._downloader.process_info({
1268 'id': video_id.decode('utf-8'),
1269 'url': video_url.decode('utf-8'),
1270 'uploader': video_uploader.decode('utf-8'),
1271 'upload_date': u'NA',
1272 'title': video_title,
1273 'stitle': simple_title,
1274 'ext': video_extension.decode('utf-8'),
1278 except UnavailableVideoError:
1279 self._downloader.trouble(u'\nERROR: unable to download video')
1282 class DailymotionIE(InfoExtractor):
1283 """Information Extractor for Dailymotion"""
# NOTE(review): gap-ridden numbered listing — 'try:'/'if mobj is None:'/'return'
# guard lines are omitted between visible statements; Python 2 source.
1285 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1287 def __init__(self, downloader=None):
1288 InfoExtractor.__init__(self, downloader)
1292 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1294 def report_download_webpage(self, video_id):
1295 """Report webpage download."""
1296 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1298 def report_extraction(self, video_id):
1299 """Report information extraction."""
1300 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1302 def _real_initialize(self):
# _real_extract: scrapes the addVariable("video", ...) flash parameter for the
# media URL and the '<title>Dailymotion - ...' tag for the title.
1305 def _real_extract(self, url):
1306 # Extract id and simplified title from URL
1307 mobj = re.match(self._VALID_URL, url)
1309 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1312 # At this point we have a new video
1313 self._downloader.increment_downloads()
1314 video_id = mobj.group(1)
# URL slug after the underscore is used as the simple title.
1316 simple_title = mobj.group(2).decode('utf-8')
1317 video_extension = 'flv'
1319 # Retrieve video webpage to extract further information
1320 request = urllib2.Request(url)
1322 self.report_download_webpage(video_id)
1323 webpage = urllib2.urlopen(request).read()
1324 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1325 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1328 # Extract URL, uploader and title from webpage
1329 self.report_extraction(video_id)
1330 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1332 self._downloader.trouble(u'ERROR: unable to extract media URL')
1334 mediaURL = urllib.unquote(mobj.group(1))
1336 # if needed add http://www.dailymotion.com/ if relative URL
1338 video_url = mediaURL
1340 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1341 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1343 self._downloader.trouble(u'ERROR: unable to extract title')
1345 video_title = mobj.group(1).decode('utf-8')
1346 video_title = sanitize_title(video_title)
1348 mobj = re.search(r'(?im)<Attribute name="owner">(.+?)</Attribute>', webpage)
1350 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1352 video_uploader = mobj.group(1)
1355 # Process video information
1356 self._downloader.process_info({
1357 'id': video_id.decode('utf-8'),
1358 'url': video_url.decode('utf-8'),
1359 'uploader': video_uploader.decode('utf-8'),
1360 'upload_date': u'NA',
1361 'title': video_title,
1362 'stitle': simple_title,
1363 'ext': video_extension.decode('utf-8'),
1367 except UnavailableVideoError:
1368 self._downloader.trouble(u'\nERROR: unable to download video')
1370 class GoogleIE(InfoExtractor):
1371 """Information extractor for video.google.com."""
# NOTE(review): gap-ridden numbered listing — guard lines are omitted between
# visible statements; Python 2 source.
1373 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1375 def __init__(self, downloader=None):
1376 InfoExtractor.__init__(self, downloader)
1380 return (re.match(GoogleIE._VALID_URL, url) is not None)
1382 def report_download_webpage(self, video_id):
1383 """Report webpage download."""
1384 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1386 def report_extraction(self, video_id):
1387 """Report information extraction."""
1388 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1390 def _real_initialize(self):
1393 def _real_extract(self, url):
1394 # Extract id from URL
1395 mobj = re.match(self._VALID_URL, url)
1397 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1400 # At this point we have a new video
1401 self._downloader.increment_downloads()
1402 video_id = mobj.group(1)
1404 video_extension = 'mp4'
1406 # Retrieve video webpage to extract further information
1407 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1409 self.report_download_webpage(video_id)
1410 webpage = urllib2.urlopen(request).read()
1411 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1412 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1415 # Extract URL, uploader, and title from webpage
1416 self.report_extraction(video_id)
# Prefer the mp4 download_url; fall back to the escaped flv videoUrl
# (\x3d / \x26 sequences are un-escaped below).
1417 mobj = re.search(r"download_url:'([^']+)'", webpage)
1419 video_extension = 'flv'
1420 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1422 self._downloader.trouble(u'ERROR: unable to extract media URL')
1424 mediaURL = urllib.unquote(mobj.group(1))
1425 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1426 mediaURL = mediaURL.replace('\\x26', '\x26')
1428 video_url = mediaURL
1430 mobj = re.search(r'<title>(.*)</title>', webpage)
1432 self._downloader.trouble(u'ERROR: unable to extract title')
1434 video_title = mobj.group(1).decode('utf-8')
1435 video_title = sanitize_title(video_title)
1436 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1438 # Extract video description
1439 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1441 self._downloader.trouble(u'ERROR: unable to extract video description')
1443 video_description = mobj.group(1).decode('utf-8')
1444 if not video_description:
1445 video_description = 'No description available.'
1447 # Extract video thumbnail
# Thumbnail requires a second (search-results) page fetch, so it is only done on demand.
1448 if self._downloader.params.get('forcethumbnail', False):
1449 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1451 webpage = urllib2.urlopen(request).read()
1452 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1453 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1455 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1457 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1459 video_thumbnail = mobj.group(1)
1460 else: # we need something to pass to process_info
1461 video_thumbnail = ''
1465 # Process video information
# NOTE(review): no 'uploader' key is visible in this dict literal — presumably
# dropped by the listing gaps; confirm against the full source.
1466 self._downloader.process_info({
1467 'id': video_id.decode('utf-8'),
1468 'url': video_url.decode('utf-8'),
1470 'upload_date': u'NA',
1471 'title': video_title,
1472 'stitle': simple_title,
1473 'ext': video_extension.decode('utf-8'),
1477 except UnavailableVideoError:
1478 self._downloader.trouble(u'\nERROR: unable to download video')
1481 class PhotobucketIE(InfoExtractor):
1482 """Information extractor for photobucket.com."""
# NOTE(review): gap-ridden numbered listing — guard lines are omitted between
# visible statements; Python 2 source.
1484 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1486 def __init__(self, downloader=None):
1487 InfoExtractor.__init__(self, downloader)
1491 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1493 def report_download_webpage(self, video_id):
1494 """Report webpage download."""
1495 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1497 def report_extraction(self, video_id):
1498 """Report information extraction."""
1499 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1501 def _real_initialize(self):
1504 def _real_extract(self, url):
1505 # Extract id from URL
1506 mobj = re.match(self._VALID_URL, url)
1508 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1511 # At this point we have a new video
1512 self._downloader.increment_downloads()
# The 'current=...flv' query value serves as the video id.
1513 video_id = mobj.group(1)
1515 video_extension = 'flv'
1517 # Retrieve video webpage to extract further information
1518 request = urllib2.Request(url)
1520 self.report_download_webpage(video_id)
1521 webpage = urllib2.urlopen(request).read()
1522 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1523 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1526 # Extract URL, uploader, and title from webpage
1527 self.report_extraction(video_id)
1528 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1530 self._downloader.trouble(u'ERROR: unable to extract media URL')
1532 mediaURL = urllib.unquote(mobj.group(1))
1534 video_url = mediaURL
# Title regex doubles as uploader source: group(1) = title, group(2) = uploader.
1536 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1538 self._downloader.trouble(u'ERROR: unable to extract title')
1540 video_title = mobj.group(1).decode('utf-8')
1541 video_title = sanitize_title(video_title)
1542 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1544 video_uploader = mobj.group(2).decode('utf-8')
1547 # Process video information
1548 self._downloader.process_info({
1549 'id': video_id.decode('utf-8'),
1550 'url': video_url.decode('utf-8'),
1551 'uploader': video_uploader,
1552 'upload_date': u'NA',
1553 'title': video_title,
1554 'stitle': simple_title,
1555 'ext': video_extension.decode('utf-8'),
1559 except UnavailableVideoError:
1560 self._downloader.trouble(u'\nERROR: unable to download video')
1563 class YahooIE(InfoExtractor):
1564 """Information extractor for video.yahoo.com."""
# NOTE(review): gap-ridden numbered listing — guard lines are omitted between
# visible statements; Python 2 source.
1566 # _VALID_URL matches all Yahoo! Video URLs
1567 # _VPAGE_URL matches only the extractable '/watch/' URLs
1568 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1569 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1571 def __init__(self, downloader=None):
1572 InfoExtractor.__init__(self, downloader)
1576 return (re.match(YahooIE._VALID_URL, url) is not None)
1578 def report_download_webpage(self, video_id):
1579 """Report webpage download."""
1580 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1582 def report_extraction(self, video_id):
1583 """Report information extraction."""
1584 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1586 def _real_initialize(self):
# _real_extract: non-'/watch/' URLs are first rewritten to the canonical watch URL
# (one recursive call with new_video=False), then the watch page plus the
# getPlaylistFOP playlist XML are scraped for metadata and the media URL.
1589 def _real_extract(self, url, new_video=True):
1590 # Extract ID from URL
1591 mobj = re.match(self._VALID_URL, url)
1593 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1596 # At this point we have a new video
1597 self._downloader.increment_downloads()
1598 video_id = mobj.group(2)
1599 video_extension = 'flv'
1601 # Rewrite valid but non-extractable URLs as
1602 # extractable English language /watch/ URLs
1603 if re.match(self._VPAGE_URL, url) is None:
1604 request = urllib2.Request(url)
1606 webpage = urllib2.urlopen(request).read()
1607 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1608 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1611 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1613 self._downloader.trouble(u'ERROR: Unable to extract id field')
1615 yahoo_id = mobj.group(1)
1617 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1619 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1621 yahoo_vid = mobj.group(1)
1623 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1624 return self._real_extract(url, new_video=False)
1626 # Retrieve video webpage to extract further information
1627 request = urllib2.Request(url)
1629 self.report_download_webpage(video_id)
1630 webpage = urllib2.urlopen(request).read()
1631 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1632 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1635 # Extract uploader and title from webpage
1636 self.report_extraction(video_id)
1637 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1639 self._downloader.trouble(u'ERROR: unable to extract video title')
1641 video_title = mobj.group(1).decode('utf-8')
1642 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
# NOTE(review): group(1) of this regex captures '(people|profile)', while the
# display name is group(2) — looks like a latent bug; confirm against live pages
# before changing (doc-only pass).
1644 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1646 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1648 video_uploader = mobj.group(1).decode('utf-8')
1650 # Extract video thumbnail
1651 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1653 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1655 video_thumbnail = mobj.group(1).decode('utf-8')
1657 # Extract video description
1658 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1660 self._downloader.trouble(u'ERROR: unable to extract video description')
1662 video_description = mobj.group(1).decode('utf-8')
1663 if not video_description: video_description = 'No description available.'
1665 # Extract video height and width
1666 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1668 self._downloader.trouble(u'ERROR: unable to extract video height')
1670 yv_video_height = mobj.group(1)
1672 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1674 self._downloader.trouble(u'ERROR: unable to extract video width')
1676 yv_video_width = mobj.group(1)
1678 # Retrieve video playlist to extract media URL
1679 # I'm not completely sure what all these options are, but we
1680 # seem to need most of them, otherwise the server sends a 401.
1681 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1682 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1683 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1684 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1685 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1687 self.report_download_webpage(video_id)
1688 webpage = urllib2.urlopen(request).read()
1689 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1690 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1693 # Extract media URL from playlist XML
1694 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1696 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1698 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1699 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1702 # Process video information
# NOTE(review): 'thumbnail' and 'description' appear twice in this dict literal;
# the later entries silently overwrite the earlier ones.
1703 self._downloader.process_info({
1704 'id': video_id.decode('utf-8'),
1706 'uploader': video_uploader,
1707 'upload_date': u'NA',
1708 'title': video_title,
1709 'stitle': simple_title,
1710 'ext': video_extension.decode('utf-8'),
1711 'thumbnail': video_thumbnail.decode('utf-8'),
1712 'description': video_description,
1713 'thumbnail': video_thumbnail,
1714 'description': video_description,
1717 except UnavailableVideoError:
1718 self._downloader.trouble(u'\nERROR: unable to download video')
1721 class VimeoIE(InfoExtractor):
1722 """Information extractor for vimeo.com."""
# NOTE(review): gap-ridden numbered listing — guard lines are omitted between
# visible statements; Python 2 source.
1724 # _VALID_URL matches Vimeo URLs
1725 _VALID_URL = r'(?:http://)?vimeo\.com/([0-9]+)'
1727 def __init__(self, downloader=None):
1728 InfoExtractor.__init__(self, downloader)
1732 return (re.match(VimeoIE._VALID_URL, url) is not None)
1734 def report_download_webpage(self, video_id):
1735 """Report webpage download."""
1736 self._downloader.to_screen(u'[video.vimeo] %s: Downloading webpage' % video_id)
1738 def report_extraction(self, video_id):
1739 """Report information extraction."""
1740 self._downloader.to_screen(u'[video.vimeo] %s: Extracting information' % video_id)
1742 def _real_initialize(self):
# _real_extract: scrapes the moogaloop clip-config XML for metadata plus a
# signature/expiry pair, then assembles the moogaloop 'play' URL from them.
1745 def _real_extract(self, url, new_video=True):
1746 # Extract ID from URL
1747 mobj = re.match(self._VALID_URL, url)
1749 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1752 # At this point we have a new video
1753 self._downloader.increment_downloads()
1754 video_id = mobj.group(1)
1755 video_extension = 'flv' # FIXME
1757 # Retrieve video webpage to extract further information
1758 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
1760 self.report_download_webpage(video_id)
1761 webpage = urllib2.urlopen(request).read()
1762 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1763 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1766 # Extract uploader and title from webpage
1767 self.report_extraction(video_id)
1768 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
1770 self._downloader.trouble(u'ERROR: unable to extract video title')
1772 video_title = mobj.group(1).decode('utf-8')
1773 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1775 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
1777 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1779 video_uploader = mobj.group(1).decode('utf-8')
1781 # Extract video thumbnail
1782 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
1784 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1786 video_thumbnail = mobj.group(1).decode('utf-8')
# NOTE(review): description extraction is commented out; the hard-coded
# placeholder 'Foo.' below is what actually gets reported.
1788 # # Extract video description
1789 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
1791 # self._downloader.trouble(u'ERROR: unable to extract video description')
1793 # video_description = mobj.group(1).decode('utf-8')
1794 # if not video_description: video_description = 'No description available.'
1795 video_description = 'Foo.'
1797 # Extract request signature
1798 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
1800 self._downloader.trouble(u'ERROR: unable to extract request signature')
1802 sig = mobj.group(1).decode('utf-8')
1804 # Extract request signature expiration
1805 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
1807 self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
1809 sig_exp = mobj.group(1).decode('utf-8')
1811 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s" % (video_id, sig, sig_exp)
1814 # Process video information
# NOTE(review): 'thumbnail' and 'description' appear twice in this dict literal;
# the later entries silently overwrite the earlier ones.
1815 self._downloader.process_info({
1816 'id': video_id.decode('utf-8'),
1818 'uploader': video_uploader,
1819 'upload_date': u'NA',
1820 'title': video_title,
1821 'stitle': simple_title,
1822 'ext': video_extension.decode('utf-8'),
1823 'thumbnail': video_thumbnail.decode('utf-8'),
1824 'description': video_description,
1825 'thumbnail': video_thumbnail,
1826 'description': video_description,
1829 except UnavailableVideoError:
1830 self._downloader.trouble(u'ERROR: unable to download video')
1833 class GenericIE(InfoExtractor):
1834 """Generic last-resort information extractor."""
# NOTE(review): gap-ridden numbered listing — guard lines are omitted between
# visible statements; Python 2 source.
1836 def __init__(self, downloader=None):
1837 InfoExtractor.__init__(self, downloader)
1843 def report_download_webpage(self, video_id):
1844 """Report webpage download."""
1845 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1846 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1848 def report_extraction(self, video_id):
1849 """Report information extraction."""
1850 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1852 def _real_initialize(self):
# _real_extract: last-resort scrape — guesses a JW-Player-style 'file=' /
# 'source=' URL out of an arbitrary page.
1855 def _real_extract(self, url):
1856 # At this point we have a new video
1857 self._downloader.increment_downloads()
1859 video_id = url.split('/')[-1]
1860 request = urllib2.Request(url)
1862 self.report_download_webpage(video_id)
1863 webpage = urllib2.urlopen(request).read()
1864 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1865 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1867 except ValueError, err:
1868 # since this is the last-resort InfoExtractor, if
1869 # this error is thrown, it'll be thrown here
1870 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1873 self.report_extraction(video_id)
1874 # Start with something easy: JW Player in SWFObject
1875 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1877 # Broaden the search a little bit
1878 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1880 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1883 # It's possible that one of the regexes
1884 # matched, but returned an empty group:
1885 if mobj.group(1) is None:
1886 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1889 video_url = urllib.unquote(mobj.group(1))
1890 video_id = os.path.basename(video_url)
1892 # here's a fun little line of code for you:
1893 video_extension = os.path.splitext(video_id)[1][1:]
1894 video_id = os.path.splitext(video_id)[0]
1896 # it's tempting to parse this further, but you would
1897 # have to take into account all the variations like
1898 # Video Title - Site Name
1899 # Site Name | Video Title
1900 # Video Title - Tagline | Site Name
1901 # and so on and so forth; it's just not practical
1902 mobj = re.search(r'<title>(.*)</title>', webpage)
1904 self._downloader.trouble(u'ERROR: unable to extract title')
1906 video_title = mobj.group(1).decode('utf-8')
1907 video_title = sanitize_title(video_title)
1908 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1910 # video uploader is domain name
# NOTE(review): the failure message below says 'unable to extract title' but this
# match is for the uploader/domain — looks copy-pasted; confirm before changing
# (doc-only pass).
1911 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1913 self._downloader.trouble(u'ERROR: unable to extract title')
1915 video_uploader = mobj.group(1).decode('utf-8')
1918 # Process video information
1919 self._downloader.process_info({
1920 'id': video_id.decode('utf-8'),
1921 'url': video_url.decode('utf-8'),
1922 'uploader': video_uploader,
1923 'upload_date': u'NA',
1924 'title': video_title,
1925 'stitle': simple_title,
1926 'ext': video_extension.decode('utf-8'),
1930 except UnavailableVideoError, err:
1931 self._downloader.trouble(u'\nERROR: unable to download video')
1934 class YoutubeSearchIE(InfoExtractor):
1935 """Information Extractor for YouTube search queries."""
# NOTE(review): gap-ridden numbered listing — guard lines are omitted between
# visible statements; Python 2 source.
# Query prefix grammar: 'ytsearch:' = 1 result, 'ytsearchall:' = max results,
# 'ytsearchN:' = N results (capped at _max_youtube_results).
1936 _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1937 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1938 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1939 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1941 _max_youtube_results = 1000
1943 def __init__(self, youtube_ie, downloader=None):
1944 InfoExtractor.__init__(self, downloader)
1945 self._youtube_ie = youtube_ie
1949 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1951 def report_download_page(self, query, pagenum):
1952 """Report attempt to download playlist page with given number."""
1953 query = query.decode(preferredencoding())
1954 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1956 def _real_initialize(self):
1957 self._youtube_ie.initialize()
# _real_extract: parses the prefix to a result count and delegates to
# _download_n_results; an unparseable numeric prefix falls back to 1 result.
1959 def _real_extract(self, query):
1960 mobj = re.match(self._VALID_QUERY, query)
1962 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1965 prefix, query = query.split(':')
1967 query = query.encode('utf-8')
1969 self._download_n_results(query, 1)
1971 elif prefix == 'all':
1972 self._download_n_results(query, self._max_youtube_results)
1978 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1980 elif n > self._max_youtube_results:
1981 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1982 n = self._max_youtube_results
1983 self._download_n_results(query, n)
1985 except ValueError: # parsing prefix as integer fails
1986 self._download_n_results(query, 1)
1989 def _download_n_results(self, query, n):
1990 """Downloads a specified number of results for a query"""
1991 # Pages are fetched until n unique ids are collected or no 'Next' link remains;
1992 # each collected id is then handed to the YouTube extractor.
1993 already_seen = set()
1997 self.report_download_page(query, pagenum)
1998 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1999 request = urllib2.Request(result_url)
2001 page = urllib2.urlopen(request).read()
2002 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2003 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2006 # Extract video identifiers
2007 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# The id is sliced out of href="/watch?v=..." — split on '=' and drop the trailing quote.
2008 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2009 if video_id not in already_seen:
2010 video_ids.append(video_id)
2011 already_seen.add(video_id)
2012 if len(video_ids) == n:
2013 # Specified n videos reached
2014 for id in video_ids:
2015 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2018 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2019 for id in video_ids:
2020 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2023 pagenum = pagenum + 1
2025 class GoogleSearchIE(InfoExtractor):
2026 """Information Extractor for Google Video search queries."""
2027 _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
2028 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2029 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2030 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2032 _max_google_results = 1000
2034 def __init__(self, google_ie, downloader=None):
2035 InfoExtractor.__init__(self, downloader)
2036 self._google_ie = google_ie
2040 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
2042 def report_download_page(self, query, pagenum):
2043 """Report attempt to download playlist page with given number."""
2044 query = query.decode(preferredencoding())
2045 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2047 def _real_initialize(self):
2048 self._google_ie.initialize()
2050 def _real_extract(self, query):
2051 mobj = re.match(self._VALID_QUERY, query)
2053 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2056 prefix, query = query.split(':')
2058 query = query.encode('utf-8')
2060 self._download_n_results(query, 1)
2062 elif prefix == 'all':
2063 self._download_n_results(query, self._max_google_results)
2069 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2071 elif n > self._max_google_results:
2072 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2073 n = self._max_google_results
2074 self._download_n_results(query, n)
2076 except ValueError: # parsing prefix as integer fails
2077 self._download_n_results(query, 1)
2080 def _download_n_results(self, query, n):
2081 """Downloads a specified number of results for a query"""
2084 already_seen = set()
2088 self.report_download_page(query, pagenum)
2089 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2090 request = urllib2.Request(result_url)
2092 page = urllib2.urlopen(request).read()
2093 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2094 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2097 # Extract video identifiers
2098 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2099 video_id = mobj.group(1)
2100 if video_id not in already_seen:
2101 video_ids.append(video_id)
2102 already_seen.add(video_id)
2103 if len(video_ids) == n:
2104 # Specified n videos reached
2105 for id in video_ids:
2106 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2109 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2110 for id in video_ids:
2111 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2114 pagenum = pagenum + 1
2116 class YahooSearchIE(InfoExtractor):
2117 """Information Extractor for Yahoo! Video search queries."""
2118 _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
2119 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2120 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2121 _MORE_PAGES_INDICATOR = r'\s*Next'
2123 _max_yahoo_results = 1000
2125 def __init__(self, yahoo_ie, downloader=None):
2126 InfoExtractor.__init__(self, downloader)
2127 self._yahoo_ie = yahoo_ie
2131 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
2133 def report_download_page(self, query, pagenum):
2134 """Report attempt to download playlist page with given number."""
2135 query = query.decode(preferredencoding())
2136 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2138 def _real_initialize(self):
2139 self._yahoo_ie.initialize()
2141 def _real_extract(self, query):
2142 mobj = re.match(self._VALID_QUERY, query)
2144 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2147 prefix, query = query.split(':')
2149 query = query.encode('utf-8')
2151 self._download_n_results(query, 1)
2153 elif prefix == 'all':
2154 self._download_n_results(query, self._max_yahoo_results)
2160 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2162 elif n > self._max_yahoo_results:
2163 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2164 n = self._max_yahoo_results
2165 self._download_n_results(query, n)
2167 except ValueError: # parsing prefix as integer fails
2168 self._download_n_results(query, 1)
2171 def _download_n_results(self, query, n):
2172 """Downloads a specified number of results for a query"""
2175 already_seen = set()
2179 self.report_download_page(query, pagenum)
2180 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2181 request = urllib2.Request(result_url)
2183 page = urllib2.urlopen(request).read()
2184 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2185 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2188 # Extract video identifiers
2189 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2190 video_id = mobj.group(1)
2191 if video_id not in already_seen:
2192 video_ids.append(video_id)
2193 already_seen.add(video_id)
2194 if len(video_ids) == n:
2195 # Specified n videos reached
2196 for id in video_ids:
2197 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2200 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2201 for id in video_ids:
2202 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2205 pagenum = pagenum + 1
2207 class YoutubePlaylistIE(InfoExtractor):
2208 """Information Extractor for YouTube playlists."""
2210 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/|p/)([^&]+).*'
2211 _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
2212 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2213 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2216 def __init__(self, youtube_ie, downloader=None):
2217 InfoExtractor.__init__(self, downloader)
2218 self._youtube_ie = youtube_ie
2222 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
2224 def report_download_page(self, playlist_id, pagenum):
2225 """Report attempt to download playlist page with given number."""
2226 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2228 def _real_initialize(self):
2229 self._youtube_ie.initialize()
2231 def _real_extract(self, url):
2232 # Extract playlist id
2233 mobj = re.match(self._VALID_URL, url)
2235 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2238 # Download playlist pages
2239 playlist_id = mobj.group(1)
2244 self.report_download_page(playlist_id, pagenum)
2245 request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum))
2247 page = urllib2.urlopen(request).read()
2248 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2249 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2252 # Extract video identifiers
2254 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2255 if mobj.group(1) not in ids_in_page:
2256 ids_in_page.append(mobj.group(1))
2257 video_ids.extend(ids_in_page)
2259 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2261 pagenum = pagenum + 1
2263 playliststart = self._downloader.params.get('playliststart', 1) - 1
2264 playlistend = self._downloader.params.get('playlistend', -1)
2265 video_ids = video_ids[playliststart:playlistend]
2267 for id in video_ids:
2268 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2271 class YoutubeUserIE(InfoExtractor):
2272 """Information Extractor for YouTube users."""
2274 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
2275 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2276 _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
2279 def __init__(self, youtube_ie, downloader=None):
2280 InfoExtractor.__init__(self, downloader)
2281 self._youtube_ie = youtube_ie
2285 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2287 def report_download_page(self, username):
2288 """Report attempt to download user page."""
2289 self._downloader.to_screen(u'[youtube] user %s: Downloading page ' % (username))
2291 def _real_initialize(self):
2292 self._youtube_ie.initialize()
2294 def _real_extract(self, url):
2296 mobj = re.match(self._VALID_URL, url)
2298 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2301 # Download user page
2302 username = mobj.group(1)
2306 self.report_download_page(username)
2307 request = urllib2.Request(self._TEMPLATE_URL % (username))
2309 page = urllib2.urlopen(request).read()
2310 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2311 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2314 # Extract video identifiers
2317 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2318 if mobj.group(1) not in ids_in_page:
2319 ids_in_page.append(mobj.group(1))
2320 video_ids.extend(ids_in_page)
2322 playliststart = self._downloader.params.get('playliststart', 1) - 1
2323 playlistend = self._downloader.params.get('playlistend', -1)
2324 video_ids = video_ids[playliststart:playlistend]
2326 for id in video_ids:
2327 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2330 class DepositFilesIE(InfoExtractor):
2331 """Information extractor for depositfiles.com"""
2333 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2335 def __init__(self, downloader=None):
2336 InfoExtractor.__init__(self, downloader)
2340 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2342 def report_download_webpage(self, file_id):
2343 """Report webpage download."""
2344 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2346 def report_extraction(self, file_id):
2347 """Report information extraction."""
2348 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2350 def _real_initialize(self):
2353 def _real_extract(self, url):
2354 # At this point we have a new file
2355 self._downloader.increment_downloads()
2357 file_id = url.split('/')[-1]
2358 # Rebuild url in english locale
2359 url = 'http://depositfiles.com/en/files/' + file_id
2361 # Retrieve file webpage with 'Free download' button pressed
2362 free_download_indication = { 'gateway_result' : '1' }
2363 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2365 self.report_download_webpage(file_id)
2366 webpage = urllib2.urlopen(request).read()
2367 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2368 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2371 # Search for the real file URL
2372 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2373 if (mobj is None) or (mobj.group(1) is None):
2374 # Try to figure out reason of the error.
2375 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2376 if (mobj is not None) and (mobj.group(1) is not None):
2377 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2378 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2380 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2383 file_url = mobj.group(1)
2384 file_extension = os.path.splitext(file_url)[1][1:]
2386 # Search for file title
2387 mobj = re.search(r'<b title="(.*?)">', webpage)
2389 self._downloader.trouble(u'ERROR: unable to extract title')
2391 file_title = mobj.group(1).decode('utf-8')
2394 # Process file information
2395 self._downloader.process_info({
2396 'id': file_id.decode('utf-8'),
2397 'url': file_url.decode('utf-8'),
2399 'upload_date': u'NA',
2400 'title': file_title,
2401 'stitle': file_title,
2402 'ext': file_extension.decode('utf-8'),
2406 except UnavailableVideoError, err:
2407 self._downloader.trouble(u'ERROR: unable to download file')
class PostProcessor(object):
	"""Post Processor class.

	PostProcessor objects can be added to downloaders with their
	add_post_processor() method. When the downloader has finished a
	successful download, it will take its internal chain of PostProcessors
	and start calling the run() method on each one of them, first with
	an initial argument and then with the returned value of the previous
	one.

	The chain will be stopped if one of them ever returns None or the end
	of the chain is reached.

	PostProcessor objects follow a "mutual registration" process similar
	to InfoExtractor objects.
	"""

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Run the PostProcessor.

		The "information" argument is a dictionary like the ones
		composed by InfoExtractors. The only difference is that this
		one has an extra field called "filepath" that points to the
		downloaded file.

		When this method returns None, the postprocessing chain is
		stopped. However, this method may return an information
		dictionary that will be passed to the next postprocessing
		object in the chain. It can be the one it received after
		changing some fields.

		In addition, this method may raise a PostProcessingError
		exception that will be taken into account by the downloader
		it was called from.
		"""
		return information # by default, do nothing
2455 ### MAIN PROGRAM ###
2456 if __name__ == '__main__':
2458 # Modules needed only when running the main program
2462 # Function to update the program file with the latest version from the repository.
2463 def update_self(downloader, filename):
2464 # Note: downloader only used for options
2465 if not os.access(filename, os.W_OK):
2466 sys.exit('ERROR: no write permissions on %s' % filename)
2468 downloader.to_screen('Updating to latest stable version...')
2470 latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
2471 latest_version = urllib.urlopen(latest_url).read().strip()
2472 prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
2473 newcontent = urllib.urlopen(prog_url).read()
2474 except (IOError, OSError), err:
2475 sys.exit('ERROR: unable to download latest version')
2477 stream = open(filename, 'w')
2478 stream.write(newcontent)
2480 except (IOError, OSError), err:
2481 sys.exit('ERROR: unable to overwrite current version')
2482 downloader.to_screen('Updated to version %s' % latest_version)
2484 # Parse command line
2485 parser = optparse.OptionParser(
2486 usage='Usage: %prog [options] url...',
2487 version='2010.12.09',
2488 conflict_handler='resolve',
2491 parser.add_option('-h', '--help',
2492 action='help', help='print this help text and exit')
2493 parser.add_option('-v', '--version',
2494 action='version', help='print program version and exit')
2495 parser.add_option('-U', '--update',
2496 action='store_true', dest='update_self', help='update this program to latest stable version')
2497 parser.add_option('-i', '--ignore-errors',
2498 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
2499 parser.add_option('-r', '--rate-limit',
2500 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
2501 parser.add_option('-R', '--retries',
2502 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
2503 parser.add_option('--playlist-start',
2504 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
2505 parser.add_option('--playlist-end',
2506 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
2507 parser.add_option('--dump-user-agent',
2508 action='store_true', dest='dump_user_agent',
2509 help='display the current browser identification', default=False)
2511 authentication = optparse.OptionGroup(parser, 'Authentication Options')
2512 authentication.add_option('-u', '--username',
2513 dest='username', metavar='USERNAME', help='account username')
2514 authentication.add_option('-p', '--password',
2515 dest='password', metavar='PASSWORD', help='account password')
2516 authentication.add_option('-n', '--netrc',
2517 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
2518 parser.add_option_group(authentication)
2520 video_format = optparse.OptionGroup(parser, 'Video Format Options')
2521 video_format.add_option('-f', '--format',
2522 action='store', dest='format', metavar='FORMAT', help='video format code')
2523 video_format.add_option('--all-formats',
2524 action='store_const', dest='format', help='download all available video formats', const='-1')
2525 video_format.add_option('--max-quality',
2526 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
2527 parser.add_option_group(video_format)
2529 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
2530 verbosity.add_option('-q', '--quiet',
2531 action='store_true', dest='quiet', help='activates quiet mode', default=False)
2532 verbosity.add_option('-s', '--simulate',
2533 action='store_true', dest='simulate', help='do not download video', default=False)
2534 verbosity.add_option('-g', '--get-url',
2535 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
2536 verbosity.add_option('-e', '--get-title',
2537 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
2538 verbosity.add_option('--get-thumbnail',
2539 action='store_true', dest='getthumbnail',
2540 help='simulate, quiet but print thumbnail URL', default=False)
2541 verbosity.add_option('--get-description',
2542 action='store_true', dest='getdescription',
2543 help='simulate, quiet but print video description', default=False)
2544 verbosity.add_option('--get-filename',
2545 action='store_true', dest='getfilename',
2546 help='simulate, quiet but print output filename', default=False)
2547 verbosity.add_option('--no-progress',
2548 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
2549 verbosity.add_option('--console-title',
2550 action='store_true', dest='consoletitle',
2551 help='display progress in console titlebar', default=False)
2552 parser.add_option_group(verbosity)
2554 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
2555 filesystem.add_option('-t', '--title',
2556 action='store_true', dest='usetitle', help='use title in file name', default=False)
2557 filesystem.add_option('-l', '--literal',
2558 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
2559 filesystem.add_option('-A', '--auto-number',
2560 action='store_true', dest='autonumber',
2561 help='number downloaded files starting from 00000', default=False)
2562 filesystem.add_option('-o', '--output',
2563 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
2564 filesystem.add_option('-a', '--batch-file',
2565 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
2566 filesystem.add_option('-w', '--no-overwrites',
2567 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
2568 filesystem.add_option('-c', '--continue',
2569 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
2570 filesystem.add_option('--cookies',
2571 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
2572 filesystem.add_option('--no-part',
2573 action='store_true', dest='nopart', help='do not use .part files', default=False)
2574 filesystem.add_option('--no-mtime',
2575 action='store_false', dest='updatetime',
2576 help='do not use the Last-modified header to set the file modification time', default=True)
2577 parser.add_option_group(filesystem)
2579 (opts, args) = parser.parse_args()
2581 # Open appropriate CookieJar
2582 if opts.cookiefile is None:
2583 jar = cookielib.CookieJar()
2586 jar = cookielib.MozillaCookieJar(opts.cookiefile)
2587 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
2589 except (IOError, OSError), err:
2590 sys.exit(u'ERROR: unable to open cookie file')
2593 if opts.dump_user_agent:
2594 print std_headers['User-Agent']
2597 # General configuration
2598 cookie_processor = urllib2.HTTPCookieProcessor(jar)
2599 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()))
2600 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
2602 # Batch file verification
2604 if opts.batchfile is not None:
2606 if opts.batchfile == '-':
2609 batchfd = open(opts.batchfile, 'r')
2610 batchurls = batchfd.readlines()
2611 batchurls = [x.strip() for x in batchurls]
2612 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
2614 sys.exit(u'ERROR: batch file could not be read')
2615 all_urls = batchurls + args
2617 # Conflicting, missing and erroneous options
2618 if opts.usenetrc and (opts.username is not None or opts.password is not None):
2619 parser.error(u'using .netrc conflicts with giving username/password')
2620 if opts.password is not None and opts.username is None:
2621 parser.error(u'account username missing')
2622 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
2623 parser.error(u'using output template conflicts with using title, literal title or auto number')
2624 if opts.usetitle and opts.useliteral:
2625 parser.error(u'using title conflicts with using literal title')
2626 if opts.username is not None and opts.password is None:
2627 opts.password = getpass.getpass(u'Type account password and press return:')
2628 if opts.ratelimit is not None:
2629 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
2630 if numeric_limit is None:
2631 parser.error(u'invalid rate limit specified')
2632 opts.ratelimit = numeric_limit
2633 if opts.retries is not None:
2635 opts.retries = long(opts.retries)
2636 except (TypeError, ValueError), err:
2637 parser.error(u'invalid retry count specified')
2639 opts.playliststart = long(opts.playliststart)
2640 if opts.playliststart <= 0:
2642 except (TypeError, ValueError), err:
2643 parser.error(u'invalid playlist start number specified')
2645 opts.playlistend = long(opts.playlistend)
2646 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
2648 except (TypeError, ValueError), err:
2649 parser.error(u'invalid playlist end number specified')
2651 # Information extractors
2652 vimeo_ie = VimeoIE()
2653 youtube_ie = YoutubeIE()
2654 metacafe_ie = MetacafeIE(youtube_ie)
2655 dailymotion_ie = DailymotionIE()
2656 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
2657 youtube_user_ie = YoutubeUserIE(youtube_ie)
2658 youtube_search_ie = YoutubeSearchIE(youtube_ie)
2659 google_ie = GoogleIE()
2660 google_search_ie = GoogleSearchIE(google_ie)
2661 photobucket_ie = PhotobucketIE()
2662 yahoo_ie = YahooIE()
2663 yahoo_search_ie = YahooSearchIE(yahoo_ie)
2664 deposit_files_ie = DepositFilesIE()
2665 generic_ie = GenericIE()
2668 fd = FileDownloader({
2669 'usenetrc': opts.usenetrc,
2670 'username': opts.username,
2671 'password': opts.password,
2672 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
2673 'forceurl': opts.geturl,
2674 'forcetitle': opts.gettitle,
2675 'forcethumbnail': opts.getthumbnail,
2676 'forcedescription': opts.getdescription,
2677 'forcefilename': opts.getfilename,
2678 'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
2679 'format': opts.format,
2680 'format_limit': opts.format_limit,
2681 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
2682 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
2683 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
2684 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
2685 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
2686 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
2687 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
2688 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
2689 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
2690 or u'%(id)s.%(ext)s'),
2691 'ignoreerrors': opts.ignoreerrors,
2692 'ratelimit': opts.ratelimit,
2693 'nooverwrites': opts.nooverwrites,
2694 'retries': opts.retries,
2695 'continuedl': opts.continue_dl,
2696 'noprogress': opts.noprogress,
2697 'playliststart': opts.playliststart,
2698 'playlistend': opts.playlistend,
2699 'logtostderr': opts.outtmpl == '-',
2700 'consoletitle': opts.consoletitle,
2701 'nopart': opts.nopart,
2702 'updatetime': opts.updatetime,
2704 fd.add_info_extractor(vimeo_ie)
2705 fd.add_info_extractor(youtube_search_ie)
2706 fd.add_info_extractor(youtube_pl_ie)
2707 fd.add_info_extractor(youtube_user_ie)
2708 fd.add_info_extractor(metacafe_ie)
2709 fd.add_info_extractor(dailymotion_ie)
2710 fd.add_info_extractor(youtube_ie)
2711 fd.add_info_extractor(google_ie)
2712 fd.add_info_extractor(google_search_ie)
2713 fd.add_info_extractor(photobucket_ie)
2714 fd.add_info_extractor(yahoo_ie)
2715 fd.add_info_extractor(yahoo_search_ie)
2716 fd.add_info_extractor(deposit_files_ie)
2718 # This must come last since it's the
2719 # fallback if none of the others work
2720 fd.add_info_extractor(generic_ie)
2723 if opts.update_self:
2724 update_self(fd, sys.argv[0])
2727 if len(all_urls) < 1:
2728 if not opts.update_self:
2729 parser.error(u'you must provide at least one URL')
2732 retcode = fd.download(all_urls)
2734 # Dump cookie jar if requested
2735 if opts.cookiefile is not None:
2738 except (IOError, OSError), err:
2739 sys.exit(u'ERROR: unable to save cookie jar')
2743 except DownloadError:
2745 except SameFileError:
2746 sys.exit(u'ERROR: fixed output name but more than one file to download')
2747 except KeyboardInterrupt:
2748 sys.exit(u'\nERROR: Interrupted by user')