2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # Author: Vasyl' Vavrychuk
7 # Author: Witold Baryluk
8 # Author: Paweł Paprota
9 # Author: Gergely Imreh
10 # License: Public domain code
18 import json # TODO: json for 2.5
35 # parse_qs was moved from the cgi module to the urlparse module recently.
37 from urlparse import parse_qs
39 from cgi import parse_qs
42 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:2.0b11) Gecko/20100101 Firefox/4.0b11',
43 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
44 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
45 'Accept-Encoding': 'gzip, deflate',
46 'Accept-Language': 'en-us,en;q=0.5',
# Characters allowed to survive in a "simplified" title: ASCII letters and digits.
simple_title_chars = (string.ascii_letters + string.digits).decode('ascii')
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks.
	"""
	# The original wrapped this constant in a generator and called .next() on
	# every invocation; that indirection added nothing. Compute it directly.
	try:
		pref = locale.getpreferredencoding()
		# Verify the reported codec actually works before trusting it.
		u'TEST'.encode(pref)
	except:
		# Fall back to a safe default when the locale is broken/unset.
		pref = 'UTF-8'
	return pref
67 def htmlentity_transform(matchobj):
68 """Transforms an HTML entity to a Unicode character.
70 This function receives a match object and is intended to be used with
71 the re.sub() function.
73 entity = matchobj.group(1)
75 # Known non-numeric HTML entity
76 if entity in htmlentitydefs.name2codepoint:
77 return unichr(htmlentitydefs.name2codepoint[entity])
80 mobj = re.match(ur'(?u)#(x?\d+)', entity)
82 numstr = mobj.group(1)
83 if numstr.startswith(u'x'):
85 numstr = u'0%s' % numstr
88 return unichr(long(numstr, base))
90 # Unknown entity in name, return its literal representation
91 return (u'&%s;' % entity)
def sanitize_title(utitle):
	"""Sanitizes a video title so it could be used as part of a filename."""
	# Replace HTML entities (named or numeric) with the characters they denote.
	utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
	# The OS path separator cannot appear inside a filename component.
	return utitle.replace(unicode(os.sep), u'%')
def sanitize_open(filename, open_mode):
	"""Try to open the given filename, and slightly tweak it if this fails.

	Attempts to open the given filename. If this fails, it tries to change
	the filename slightly, step by step, until it's either able to open it
	or it fails and raises a final exception, like the standard open()
	function.

	It returns the tuple (stream, definitive_file_name).
	"""
	try:
		# '-' means stdout; on Windows switch stdout to binary mode first.
		if filename == u'-':
			if sys.platform == 'win32':
				import msvcrt
				msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
			return (sys.stdout, filename)
		stream = open(filename, open_mode)
		return (stream, filename)
	except (IOError, OSError), err:
		# In case of error, try to remove win32 forbidden chars
		filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

		# An exception here should be caught in the caller
		stream = open(filename, open_mode)
		return (stream, filename)
def timeconvert(timestr):
	"""Convert RFC 2822 defined time string into system timestamp.

	Returns the POSIX timestamp as an integer, or None when the string
	cannot be parsed as an RFC 2822 date.
	"""
	timestamp = None
	timetuple = email.utils.parsedate_tz(timestr)
	if timetuple is not None:
		# mktime_tz honours the numeric timezone offset parsed above.
		timestamp = email.utils.mktime_tz(timetuple)
	return timestamp
class DownloadError(Exception):
	"""Download Error exception.

	This exception may be thrown by FileDownloader objects if they are not
	configured to continue on errors. They will contain the appropriate
	error message.
	"""
	pass
class SameFileError(Exception):
	"""Same File exception.

	This exception will be thrown by FileDownloader objects if they detect
	multiple files would have to be downloaded to the same file on disk.
	"""
	pass
class PostProcessingError(Exception):
	"""Post Processing exception.

	This exception may be raised by PostProcessor's .run() method to
	indicate an error in the postprocessing task.
	"""
	pass
class UnavailableVideoError(Exception):
	"""Unavailable Format exception.

	This exception will be thrown when a video is requested
	in a format that is not available for that video.
	"""
	pass
class ContentTooShortError(Exception):
	"""Content Too Short exception.

	This exception may be raised by FileDownloader objects when a file they
	download is too small for what the server announced first, indicating
	the connection was probably interrupted.
	"""
	# Both in bytes
	downloaded = None
	expected = None

	def __init__(self, downloaded, expected):
		self.downloaded = downloaded
		self.expected = expected
class YoutubeDLHandler(urllib2.HTTPHandler):
	"""Handler for HTTP requests and responses.

	This class, when installed with an OpenerDirector, automatically adds
	the standard headers to every HTTP request and handles gzipped and
	deflated responses from web servers. If compression is to be avoided in
	a particular request, the original request in the program code only has
	to include the HTTP header "Youtubedl-No-Compression", which will be
	removed before making the real request.

	Part of this code was copied from:

	http://techknack.net/python-urllib2-handlers/

	Andrew Rowls, the author of that code, agreed to release it to the
	Public Domain.
	"""

	@staticmethod
	def deflate(data):
		# Some servers send a raw deflate stream (no zlib header); try that
		# first, then fall back to a regular zlib stream.
		try:
			return zlib.decompress(data, -zlib.MAX_WBITS)
		except zlib.error:
			return zlib.decompress(data)

	@staticmethod
	def addinfourl_wrapper(stream, headers, url, code):
		# Older Pythons' addinfourl takes no 'code' argument; emulate it.
		if hasattr(urllib2.addinfourl, 'getcode'):
			return urllib2.addinfourl(stream, headers, url, code)
		ret = urllib2.addinfourl(stream, headers, url)
		ret.code = code
		return ret

	def http_request(self, req):
		# Force the standard headers, overriding any the caller set.
		for h in std_headers:
			if h in req.headers:
				del req.headers[h]
			req.add_header(h, std_headers[h])
		# Internal marker: strip compression negotiation and the marker itself
		# before the request goes on the wire.
		if 'Youtubedl-no-compression' in req.headers:
			if 'Accept-encoding' in req.headers:
				del req.headers['Accept-encoding']
			del req.headers['Youtubedl-no-compression']
		return req

	def http_response(self, req, resp):
		old_resp = resp
		# gzip: re-wrap the body in a decompressing file object.
		if resp.headers.get('Content-encoding', '') == 'gzip':
			gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		# deflate: decompress eagerly and serve from memory.
		if resp.headers.get('Content-encoding', '') == 'deflate':
			gz = StringIO.StringIO(self.deflate(resp.read()))
			resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
			resp.msg = old_resp.msg
		return resp
class FileDownloader(object):
	"""File Downloader class.

	File downloader objects are the ones responsible of downloading the
	actual video file and writing it to disk if the user has requested
	it, among some other tasks. In most cases there should be one per
	program. As, given a video URL, the downloader doesn't know how to
	extract all the needed information, task that InfoExtractors do, it
	has to pass the URL to one of them.

	For this, file downloader objects have a method that allows
	InfoExtractors to be registered in a given order. When it is passed
	a URL, the file downloader handles it to the first InfoExtractor it
	finds that reports being able to handle it. The InfoExtractor extracts
	all the information about the video or videos the URL refers to, and
	asks the FileDownloader to process the video information, possibly
	downloading the video.

	File downloaders accept a lot of parameters. In order not to saturate
	the object constructor with arguments, it receives a dictionary of
	options instead. These options are available through the params
	attribute for the InfoExtractors to use. The FileDownloader also
	registers itself as the downloader in charge for the InfoExtractors
	that are added to it, so this is a "mutual registration".

	Available options:

	username:          Username for authentication purposes.
	password:          Password for authentication purposes.
	usenetrc:          Use netrc for authentication instead.
	quiet:             Do not print messages to stdout.
	forceurl:          Force printing final URL.
	forcetitle:        Force printing title.
	forcethumbnail:    Force printing thumbnail URL.
	forcedescription:  Force printing description.
	forcefilename:     Force printing final filename.
	simulate:          Do not download the video files.
	format:            Video format code.
	format_limit:      Highest quality format to try.
	outtmpl:           Template for output names.
	ignoreerrors:      Do not stop on download errors.
	ratelimit:         Download speed limit, in bytes/sec.
	nooverwrites:      Prevent overwriting files.
	retries:           Number of times to retry for HTTP error 5xx
	continuedl:        Try to continue downloads if possible.
	noprogress:        Do not print the progress bar.
	playliststart:     Playlist item to start at.
	playlistend:       Playlist item to end at.
	logtostderr:       Log messages to stderr instead of stdout.
	consoletitle:      Display progress in console window's titlebar.
	nopart:            Do not use temporary .part files.
	updatetime:        Use the Last-modified header to set output file timestamps.
	"""

	# Class-level defaults; every one of these is rebound per-instance in
	# __init__, so the mutable lists are never actually shared.
	params = None
	_ies = []
	_pps = []
	_download_retcode = None
	_num_downloads = None
	_screen_file = None

	def __init__(self, params):
		"""Create a FileDownloader object with the given options."""
		self._ies = []          # registered InfoExtractors, in priority order
		self._pps = []          # registered PostProcessors, run as a chain
		self._download_retcode = 0
		self._num_downloads = 0
		# Progress/status messages go to stderr when 'logtostderr' is set.
		self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
		self.params = params
	def pmkdir(filename):
		"""Create directory components in filename. Similar to Unix "mkdir -p"."""
		components = filename.split(os.sep)
		# All ancestor paths, shortest first; the last component (the file
		# itself) is deliberately excluded.
		aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
		aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
		# NOTE(review): 'dir' shadows the builtin; harmless here but worth renaming.
		for dir in aggregate:
			if not os.path.exists(dir):
				os.mkdir(dir)
	def format_bytes(bytes):
		"""Format a byte count as a human-readable string, e.g. '1.21M'."""
		if bytes is None:
			return 'N/A'
		if type(bytes) is str:
			bytes = float(bytes)
		if bytes == 0.0:
			exponent = 0
		else:
			# Power of 1024 that leaves a mantissa in [1, 1024).
			exponent = long(math.log(bytes, 1024.0))
		suffix = 'bkMGTPEZY'[exponent]
		converted = float(bytes) / float(1024**exponent)
		return '%.2f%s' % (converted, suffix)
	def calc_percent(byte_counter, data_len):
		"""Return the download percentage as a fixed-width string."""
		# Unknown total length: show a placeholder of the same width.
		if data_len is None:
			return '---.-%'
		return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
	def calc_eta(start, now, total, current):
		"""Estimate time remaining as 'MM:SS', or '--:--' when unknown."""
		if total is None:
			return '--:--'
		dif = now - start
		if current == 0 or dif < 0.001: # One millisecond
			return '--:--'
		rate = float(current) / dif
		eta = long((float(total) - float(current)) / rate)
		(eta_mins, eta_secs) = divmod(eta, 60)
		# The two-digit field cannot represent more than 99 minutes.
		if eta_mins > 99:
			return '--:--'
		return '%02d:%02d' % (eta_mins, eta_secs)
	def calc_speed(start, now, bytes):
		"""Return the average download speed as a fixed-width string."""
		dif = now - start
		if bytes == 0 or dif < 0.001: # One millisecond
			return '%10s' % '---b/s'
		return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
	def best_block_size(elapsed_time, bytes):
		"""Pick the next read size from how fast the last block arrived.

		The result stays within [bytes/2, bytes*2], clamped to at most 4 MB,
		so the block size adapts gradually to the measured rate.
		"""
		new_min = max(bytes / 2.0, 1.0)
		new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
		if elapsed_time < 0.001:
			return long(new_max)
		rate = bytes / elapsed_time
		if rate > new_max:
			return long(new_max)
		if rate < new_min:
			return long(new_min)
		return long(rate)
	def parse_bytes(bytestr):
		"""Parse a string indicating a byte quantity into a long integer."""
		matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
		if matchobj is None:
			return None
		number = float(matchobj.group(1))
		# Empty suffix maps to index 0 via 'b' ... no, via ''? The lowered
		# suffix '' is found at position 0 of no string; NOTE(review): an
		# empty group(2) means index('') == 0, i.e. multiplier 1 — confirm.
		multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
		return long(round(number * multiplier))
	def add_info_extractor(self, ie):
		"""Add an InfoExtractor object to the end of the list."""
		self._ies.append(ie)
		# Mutual registration: the IE gets a back-reference to this downloader.
		ie.set_downloader(self)
	def add_post_processor(self, pp):
		"""Add a PostProcessor object to the end of the chain."""
		self._pps.append(pp)
		# Mutual registration: the PP gets a back-reference to this downloader.
		pp.set_downloader(self)
	def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
		"""Print message to stdout if not in quiet mode."""
		try:
			if not self.params.get('quiet', False):
				# skip_eol leaves the cursor on the same line (progress updates).
				terminator = [u'\n', u''][skip_eol]
				print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
				self._screen_file.flush()
		except (UnicodeEncodeError), err:
			if not ignore_encoding_errors:
				raise
	def to_stderr(self, message):
		"""Print message to stderr."""
		# Encode with the locale's preferred encoding before writing.
		print >>sys.stderr, message.encode(preferredencoding())
	def to_cons_title(self, message):
		"""Set console/terminal window title to message."""
		if not self.params.get('consoletitle', False):
			return
		if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
			# c_wchar_p() might not be necessary if `message` is
			# already of type unicode()
			ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
		elif 'TERM' in os.environ:
			# xterm escape sequence: OSC 0 sets the window title.
			sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
	def fixed_template(self):
		"""Checks if the output template is fixed.

		A template with no %(...)s placeholders produces the same filename
		for every download, which is only safe for a single URL.
		"""
		return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
	def trouble(self, message=None):
		"""Determine action to take when a download problem appears.

		Depending on if the downloader has been configured to ignore
		download errors or not, this method may throw an exception or
		not when errors are found, after printing the message.
		"""
		if message is not None:
			self.to_stderr(message)
		if not self.params.get('ignoreerrors', False):
			raise DownloadError(message)
		# Errors were ignored: remember a nonzero exit status for later.
		self._download_retcode = 1
	def slow_down(self, start_time, byte_counter):
		"""Sleep if the download speed is over the rate limit."""
		rate_limit = self.params.get('ratelimit', None)
		if rate_limit is None or byte_counter == 0:
			return
		now = time.time()
		elapsed = now - start_time
		if elapsed <= 0.0:
			return
		speed = float(byte_counter) / elapsed
		if speed > rate_limit:
			# Sleep exactly long enough to bring the average down to the limit.
			time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
	def temp_name(self, filename):
		"""Returns a temporary filename for the given filename.

		No '.part' suffix is used when parts are disabled, when writing to
		stdout, or when the path exists but is not a regular file.
		"""
		if self.params.get('nopart', False) or filename == u'-' or \
				(os.path.exists(filename) and not os.path.isfile(filename)):
			return filename
		return filename + u'.part'
	def undo_temp_name(self, filename):
		# Strip the '.part' suffix added by temp_name(), if present.
		if filename.endswith(u'.part'):
			return filename[:-len(u'.part')]
		return filename
	def try_rename(self, old_filename, new_filename):
		# Best-effort rename of the temporary file to its final name;
		# failures are reported through trouble() rather than raised.
		try:
			if old_filename == new_filename:
				return
			os.rename(old_filename, new_filename)
		except (IOError, OSError), err:
			self.trouble(u'ERROR: unable to rename file')
	def try_utime(self, filename, last_modified_hdr):
		"""Try to set the last-modified time of the given file."""
		if last_modified_hdr is None:
			return
		if not os.path.isfile(filename):
			return
		timestr = last_modified_hdr
		if timestr is None:
			return
		# RFC 2822 date -> POSIX timestamp; None when it cannot be parsed.
		filetime = timeconvert(timestr)
		if filetime is None:
			return
		try:
			os.utime(filename,(time.time(), filetime))
		except:
			# Best effort only; the download itself succeeded.
			pass
	def report_destination(self, filename):
		"""Report destination filename."""
		# Encoding errors are ignored: a mangled status line is acceptable.
		self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
	def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
		"""Report download progress."""
		if self.params.get('noprogress', False):
			return
		# '\r' rewrites the same screen line on each update.
		self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
				(percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
		self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
				(percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
	def report_resuming_byte(self, resume_len):
		"""Report attempt to resume at given byte."""
		self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
	def report_retry(self, count, retries):
		"""Report retry in case of HTTP error 5xx"""
		self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
	def report_file_already_downloaded(self, file_name):
		"""Report file has already been fully downloaded."""
		try:
			self.to_screen(u'[download] %s has already been downloaded' % file_name)
		except (UnicodeEncodeError), err:
			# Filename not representable in the console encoding: omit it.
			self.to_screen(u'[download] The file has already been downloaded')
	def report_unable_to_resume(self):
		"""Report it was impossible to resume download."""
		self.to_screen(u'[download] Unable to resume')
	def report_finish(self):
		"""Report download finished."""
		if self.params.get('noprogress', False):
			self.to_screen(u'[download] Download completed')
		else:
			# Terminate the progress line left open by report_progress().
			self.to_screen(u'')
	def increment_downloads(self):
		"""Increment the ordinal that assigns a number to each file.

		The counter feeds the %(autonumber)s output-template field.
		"""
		self._num_downloads += 1
	def prepare_filename(self, info_dict):
		"""Generate the output filename.

		Returns None (after reporting trouble) when the template references
		a missing field or cannot be formatted.
		"""
		try:
			template_dict = dict(info_dict)
			# Extra fields available only in the template, not in info_dict.
			template_dict['epoch'] = unicode(long(time.time()))
			template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
			filename = self.params['outtmpl'] % template_dict
			return filename
		except (ValueError, KeyError), err:
			self.trouble(u'ERROR: invalid system charset or erroneous output template')
			return None
	def process_info(self, info_dict):
		"""Process a single dictionary returned by an InfoExtractor."""
		filename = self.prepare_filename(info_dict)
		# Do nothing else if in simulate mode
		if self.params.get('simulate', False):
			# Forced printings (used when youtube-dl acts as a backend)
			if self.params.get('forcetitle', False):
				print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forceurl', False):
				print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
				print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcedescription', False) and 'description' in info_dict:
				print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcefilename', False) and filename is not None:
				print filename.encode(preferredencoding(), 'xmlcharrefreplace')

			return

		if filename is None:
			return
		if self.params.get('nooverwrites', False) and os.path.exists(filename):
			self.to_stderr(u'WARNING: file exists and will be skipped')
			return

		try:
			self.pmkdir(filename)
		except (OSError, IOError), err:
			self.trouble(u'ERROR: unable to create directories: %s' % str(err))
			return

		try:
			success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
		except (OSError, IOError), err:
			# Local I/O failure: the video itself is considered unavailable.
			raise UnavailableVideoError
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self.trouble(u'ERROR: unable to download video data: %s' % str(err))
			return
		except (ContentTooShortError, ), err:
			self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
			return

		if success:
			try:
				self.post_process(filename, info_dict)
			except (PostProcessingError), err:
				self.trouble(u'ERROR: postprocessing: %s' % str(err))
				return
	def download(self, url_list):
		"""Download a given list of URLs."""
		# A fixed (placeholder-free) template would write every URL to the
		# same file; refuse up front.
		if len(url_list) > 1 and self.fixed_template():
			raise SameFileError(self.params['outtmpl'])

		for url in url_list:
			suitable_found = False
			for ie in self._ies:
				# Go to next InfoExtractor if not suitable
				if not ie.suitable(url):
					continue

				# Suitable InfoExtractor found
				suitable_found = True

				# Extract information from URL and process it
				ie.extract(url)

				# Suitable InfoExtractor had been found; go to next URL
				break

			if not suitable_found:
				self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)

		return self._download_retcode
	def post_process(self, filename, ie_info):
		"""Run the postprocessing chain on the given file.

		Each PostProcessor receives the info dict and may return None to
		stop the chain.
		"""
		# Copy so postprocessors cannot mutate the extractor's dict.
		info = dict(ie_info)
		info['filepath'] = filename
		for pp in self._pps:
			info = pp.run(info)
			if info is None:
				break
	def _download_with_rtmpdump(self, filename, url, player_url):
		"""Download an rtmp:// URL by shelling out to rtmpdump."""
		self.report_destination(filename)
		tmpfilename = self.temp_name(filename)

		# Check for rtmpdump first
		try:
			subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
		except (OSError, IOError):
			self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
			return False

		# Download using rtmpdump. rtmpdump returns exit code 2 when
		# the connection was interrumpted and resuming appears to be
		# possible. This is part of rtmpdump's normal usage, AFAIK.
		basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
		retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
		while retval == 2 or retval == 1:
			prevsize = os.path.getsize(tmpfilename)
			self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
			time.sleep(5.0) # This seems to be needed
			retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
			cursize = os.path.getsize(tmpfilename)
			# No progress between resume attempts: give up on this loop.
			if prevsize == cursize and retval == 1:
				break
		if retval == 0:
			self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
			self.try_rename(tmpfilename, filename)
			return True
		else:
			self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
			return False
	def _do_download(self, filename, url, player_url):
		"""Download url to filename over HTTP (or delegate to rtmpdump).

		Returns True on success, False on a reported failure. Supports
		resuming, retries on 5xx, adaptive block size and rate limiting.
		"""
		# Check file already present
		if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
			self.report_file_already_downloaded(filename)
			return True

		# Attempt to download using rtmpdump
		if url.startswith('rtmp'):
			return self._download_with_rtmpdump(filename, url, player_url)

		tmpfilename = self.temp_name(filename)
		stream = None
		open_mode = 'wb'

		# Do not include the Accept-Encoding header
		headers = {'Youtubedl-no-compression': 'True'}
		basic_request = urllib2.Request(url, None, headers)
		request = urllib2.Request(url, None, headers)

		# Establish possible resume length
		if os.path.isfile(tmpfilename):
			resume_len = os.path.getsize(tmpfilename)
		else:
			resume_len = 0

		# Request parameters in case of being able to resume
		if self.params.get('continuedl', False) and resume_len != 0:
			self.report_resuming_byte(resume_len)
			request.add_header('Range','bytes=%d-' % resume_len)
			open_mode = 'ab'

		count = 0
		retries = self.params.get('retries', 0)
		while count <= retries:
			# Establish connection
			try:
				data = urllib2.urlopen(request)
				break
			except (urllib2.HTTPError, ), err:
				if (err.code < 500 or err.code >= 600) and err.code != 416:
					# Unexpected HTTP error
					raise
				elif err.code == 416:
					# Unable to resume (requested range not satisfiable)
					try:
						# Open the connection again without the range header
						data = urllib2.urlopen(basic_request)
						content_length = data.info()['Content-Length']
					except (urllib2.HTTPError, ), err:
						if err.code < 500 or err.code >= 600:
							raise
					else:
						# Examine the reported length
						if (content_length is not None and
								(resume_len - 100 < long(content_length) < resume_len + 100)):
							# The file had already been fully downloaded.
							# Explanation to the above condition: in issue #175 it was revealed that
							# YouTube sometimes adds or removes a few bytes from the end of the file,
							# changing the file size slightly and causing problems for some users. So
							# I decided to implement a suggested change and consider the file
							# completely downloaded if the file size differs less than 100 bytes from
							# the one in the hard drive.
							self.report_file_already_downloaded(filename)
							self.try_rename(tmpfilename, filename)
							return True
						else:
							# The length does not match, we start the download over
							self.report_unable_to_resume()
							open_mode = 'wb'
							break
			# Retry
			count += 1
			if count <= retries:
				self.report_retry(count, retries)

		if count > retries:
			self.trouble(u'ERROR: giving up after %s retries' % retries)
			return False

		data_len = data.info().get('Content-length', None)
		if data_len is not None:
			data_len = long(data_len) + resume_len
		data_len_str = self.format_bytes(data_len)
		byte_counter = 0 + resume_len
		block_size = 1024
		start = time.time()
		while True:
			# Download and write
			before = time.time()
			data_block = data.read(block_size)
			after = time.time()
			if len(data_block) == 0:
				break
			byte_counter += len(data_block)

			# Open file just in time
			if stream is None:
				try:
					(stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
					filename = self.undo_temp_name(tmpfilename)
					self.report_destination(filename)
				except (OSError, IOError), err:
					self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
					return False
			try:
				stream.write(data_block)
			except (IOError, OSError), err:
				self.trouble(u'\nERROR: unable to write data: %s' % str(err))
				return False
			# Adapt the next read size to the measured throughput.
			block_size = self.best_block_size(after - before, len(data_block))

			# Progress message
			percent_str = self.calc_percent(byte_counter, data_len)
			eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
			speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
			self.report_progress(percent_str, data_len_str, speed_str, eta_str)

			# Apply rate limit
			self.slow_down(start, byte_counter - resume_len)

		stream.close()
		self.report_finish()
		if data_len is not None and byte_counter != data_len:
			raise ContentTooShortError(byte_counter, long(data_len))
		self.try_rename(tmpfilename, filename)

		# Update file modification time
		if self.params.get('updatetime', True):
			self.try_utime(filename, data.info().get('last-modified', None))

		return True
class InfoExtractor(object):
	"""Information Extractor class.

	Information extractors are the classes that, given a URL, extract
	information from the video (or videos) the URL refers to. This
	information includes the real video URL, the video title and simplified
	title, author and others. The information is stored in a dictionary
	which is then passed to the FileDownloader. The FileDownloader
	processes this information possibly downloading the video to the file
	system, among other possible outcomes. The dictionaries must include
	the following fields:

	id:		Video identifier.
	url:		Final video URL.
	uploader:	Nickname of the video uploader.
	title:		Literal title.
	stitle:		Simplified title.
	ext:		Video filename extension.
	format:		Video format.
	player_url:	SWF Player URL (may be None).

	The following fields are optional. Their primary purpose is to allow
	youtube-dl to serve as the backend for a video search function, such
	as the one in youtube2mp3. They are only used when their respective
	forced printing functions are called:

	thumbnail:	Full URL to a video thumbnail image.
	description:	One-line video description.

	Subclasses of this one should re-define the _real_initialize() and
	_real_extract() methods, as well as the suitable() static method.
	Probably, they should also be instantiated and added to the main
	downloader.
	"""

	_ready = False        # True once _real_initialize() has run
	_downloader = None    # FileDownloader in charge of this IE

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self._ready = False
		self.set_downloader(downloader)

	@staticmethod
	def suitable(url):
		"""Receives a URL and returns True if suitable for this IE."""
		return False

	def initialize(self):
		"""Initializes an instance (authentication, etc)."""
		# Lazy one-time initialization before the first extraction.
		if not self._ready:
			self._real_initialize()
			self._ready = True

	def extract(self, url):
		"""Extracts URL information and returns it in list of dicts."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Sets the downloader for this IE."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
859 class YoutubeIE(InfoExtractor):
860 """Information extractor for youtube.com."""
862 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
863 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
864 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
865 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
866 _NETRC_MACHINE = 'youtube'
867 # Listed in order of quality
868 _available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
869 _video_extensions = {
875 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
882 return (re.match(YoutubeIE._VALID_URL, url) is not None)
884 def report_lang(self):
885 """Report attempt to set language."""
886 self._downloader.to_screen(u'[youtube] Setting language')
888 def report_login(self):
889 """Report attempt to log in."""
890 self._downloader.to_screen(u'[youtube] Logging in')
892 def report_age_confirmation(self):
893 """Report attempt to confirm age."""
894 self._downloader.to_screen(u'[youtube] Confirming age')
896 def report_video_webpage_download(self, video_id):
897 """Report attempt to download video webpage."""
898 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
900 def report_video_info_webpage_download(self, video_id):
901 """Report attempt to download video info webpage."""
902 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
904 def report_information_extraction(self, video_id):
905 """Report attempt to extract video information."""
906 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
908 def report_unavailable_format(self, video_id, format):
909 """Report extracted video URL."""
910 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
912 def report_rtmp_download(self):
913 """Indicate the download will use the RTMP protocol."""
914 self._downloader.to_screen(u'[youtube] RTMP download detected')
916 def _real_initialize(self):
917 if self._downloader is None:
922 downloader_params = self._downloader.params
924 # Attempt to use provided username and password or .netrc data
925 if downloader_params.get('username', None) is not None:
926 username = downloader_params['username']
927 password = downloader_params['password']
928 elif downloader_params.get('usenetrc', False):
930 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
935 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
936 except (IOError, netrc.NetrcParseError), err:
937 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
941 request = urllib2.Request(self._LANG_URL)
944 urllib2.urlopen(request).read()
945 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
946 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
949 # No authentication to be performed
955 'current_form': 'loginForm',
957 'action_login': 'Log In',
958 'username': username,
959 'password': password,
961 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
964 login_results = urllib2.urlopen(request).read()
965 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
966 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
968 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
969 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
975 'action_confirm': 'Confirm',
977 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
979 self.report_age_confirmation()
980 age_results = urllib2.urlopen(request).read()
981 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
982 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
def _real_extract(self, url):
    """Extract metadata for one YouTube video and hand it to the downloader.

    Downloads the watch page, then the get_video_info endpoint (trying
    several ``el=`` variants until a 'token' appears), picks the formats
    to fetch according to the downloader's 'format'/'format_limit'
    params, and calls ``self._downloader.process_info`` once per format.

    NOTE(review): this listing is an excerpt — interleaving lines from
    the full file (``try:`` openers, ``return`` statements, ``if mobj is
    None:`` guards, ``else:`` lines and dict closers) are omitted, so the
    indentation below is approximate.
    """
    # Extract video id from URL
    mobj = re.match(self._VALID_URL, url)
    # (runs only when the URL failed to match — guard line omitted above)
    self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
    video_id = mobj.group(2)

    # Get video webpage; gl/hl pin the page to US English so the regexes
    # below match, has_verified skips the age-gate interstitial.
    self.report_video_webpage_download(video_id)
    request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
    video_webpage = urllib2.urlopen(request).read()
    except (urllib2.URLError, httplib.HTTPException, socket.error), err:
        self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

    # Attempt to extract SWF player URL (JSON-escaped in the page; the
    # re.sub un-escapes the backslashed characters)
    mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    if mobj is not None:
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

    # Get video info: try several 'el' values until one response
    # contains a 'token' parameter
    self.report_video_info_webpage_download(video_id)
    for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
        video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
            % (video_id, el_type))
        request = urllib2.Request(video_info_url)
        video_info_webpage = urllib2.urlopen(request).read()
        video_info = parse_qs(video_info_webpage)
        if 'token' in video_info:
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
    if 'token' not in video_info:
        # Prefer YouTube's own failure reason when it supplies one
        if 'reason' in video_info:
            self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
        self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

    # Start extracting information
    self.report_information_extraction(video_id)

    # uploader
    if 'author' not in video_info:
        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
    video_uploader = urllib.unquote_plus(video_info['author'][0])

    # title
    if 'title' not in video_info:
        self._downloader.trouble(u'ERROR: unable to extract video title')
    video_title = urllib.unquote_plus(video_info['title'][0])
    video_title = video_title.decode('utf-8')
    video_title = sanitize_title(video_title)

    # simplified title: collapse every run of non-filename-safe chars to '_'
    simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
    simple_title = simple_title.strip(ur'_')

    # thumbnail image
    if 'thumbnail_url' not in video_info:
        self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
        video_thumbnail = ''
    else:	# don't panic if we can't find it
        video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

    # upload date: scraped from the watch page and normalized to YYYYMMDD
    mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
    if mobj is not None:
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
        for expression in format_expressions:
            upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

    # description: only scraped when the user asked for it
    video_description = 'No description available.'
    if self._downloader.params.get('forcedescription', False):
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
        if mobj is not None:
            video_description = mobj.group(1)

    # token
    video_token = urllib.unquote_plus(video_info['token'][0])

    # Decide which formats to download
    req_format = self._downloader.params.get('format', None)

    if 'fmt_url_map' in video_info and len(video_info['fmt_url_map']) >= 1 and ',' in video_info['fmt_url_map'][0]:
        # fmt_url_map is 'itag|url,itag|url,...'
        url_map = dict(tuple(pair.split('|')) for pair in video_info['fmt_url_map'][0].split(','))
        format_limit = self._downloader.params.get('format_limit', None)
        if format_limit is not None and format_limit in self._available_formats:
            # _available_formats is ordered best-first; cap at the limit
            format_list = self._available_formats[self._available_formats.index(format_limit):]
            format_list = self._available_formats
        existing_formats = [x for x in format_list if x in url_map]
        if len(existing_formats) == 0:
            self._downloader.trouble(u'ERROR: no known formats available for video')
        if req_format is None:
            video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
        elif req_format == '-1':
            video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            if req_format not in url_map:
                self._downloader.trouble(u'ERROR: requested format not available')
            video_url_list = [(req_format, url_map[req_format])] # Specific format
    elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        # Live/RTMP stream: single URL, no format identifier
        self.report_rtmp_download()
        video_url_list = [(None, video_info['conn'][0])]
        self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')

    for format_param, video_real_url in video_url_list:
        # At this point we have a new video
        self._downloader.increment_downloads()

        # Extension: looked up per-format, defaulting to flv
        video_extension = self._video_extensions.get(format_param, 'flv')

        # Find the video URL in fmt_url_map or conn paramters
        # Process video information
        self._downloader.process_info({
            'id':		video_id.decode('utf-8'),
            'url':		video_real_url.decode('utf-8'),
            'uploader':	video_uploader.decode('utf-8'),
            'upload_date':	upload_date,
            'title':	video_title,
            'stitle':	simple_title,
            'ext':		video_extension.decode('utf-8'),
            'format':	(format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail':	video_thumbnail.decode('utf-8'),
            'description':	video_description.decode('utf-8'),
            'player_url':	player_url,
        except UnavailableVideoError, err:
            self._downloader.trouble(u'\nERROR: unable to download video')
class MetacafeIE(InfoExtractor):
	"""Information Extractor for metacafe.com.

	Delegates ``yt-`` prefixed video ids to a YoutubeIE instance;
	initialization posts through the site's family filter / age gate.

	NOTE(review): this listing is an excerpt — interleaving lines from the
	full file (``try:`` openers, ``return`` statements, ``if mobj is
	None:`` guards, method headers) are omitted, so indentation below is
	approximate.
	"""

	_VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
	_DISCLAIMER = 'http://www.metacafe.com/family_filter/'
	_FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'

	def __init__(self, youtube_ie, downloader=None):
		InfoExtractor.__init__(self, downloader)
		# Kept so YouTube-hosted metacafe entries can be re-dispatched
		self._youtube_ie = youtube_ie

		# (body of the omitted suitable() staticmethod)
		return (re.match(MetacafeIE._VALID_URL, url) is not None)

	def report_disclaimer(self):
		"""Report disclaimer retrieval."""
		self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_screen(u'[metacafe] Confirming age')

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

	def _real_initialize(self):
		"""Fetch the disclaimer page, then POST the age-confirmation form."""
		# Retrieve disclaimer
		request = urllib2.Request(self._DISCLAIMER)
		self.report_disclaimer()
		disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))

		# Confirm age (entry of the omitted disclaimer_form dict literal)
			'submit': "Continue - I'm over 18",
		request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
		self.report_age_confirmation()
		disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

	def _real_extract(self, url):
		"""Extract one metacafe video and hand it to the downloader."""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

		video_id = mobj.group(1)

		# Check if video comes from YouTube
		mobj2 = re.match(r'^yt-(.*)$', video_id)
		if mobj2 is not None:
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))

		# At this point we have a new video
		self._downloader.increment_downloads()

		simple_title = mobj.group(2).decode('utf-8')

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
		self.report_download_webpage(video_id)
		webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
		if mobj is not None:
			mediaURL = urllib.unquote(mobj.group(1))
			video_extension = mediaURL[-3:]

			# Extract gdaKey if available
			mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
			video_url = mediaURL
			gdaKey = mobj.group(1)
			video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)

		# Fallback path: media URL embedded in the flashvars JSON
		mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
		self._downloader.trouble(u'ERROR: unable to extract media URL')
		vardict = parse_qs(mobj.group(1))
		if 'mediaData' not in vardict:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
		mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
		self._downloader.trouble(u'ERROR: unable to extract media URL')
		# un-escape the JSON-escaped slashes
		mediaURL = mobj.group(1).replace('\\/', '/')
		video_extension = mediaURL[-3:]
		video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

		mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
		self._downloader.trouble(u'ERROR: unable to extract title')
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)

		mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
		self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
		video_uploader = mobj.group(1)

		# Process video information
		self._downloader.process_info({
			'id':		video_id.decode('utf-8'),
			'url':		video_url.decode('utf-8'),
			'uploader':	video_uploader.decode('utf-8'),
			'upload_date':	u'NA',
			'title':	video_title,
			'stitle':	simple_title,
			'ext':		video_extension.decode('utf-8'),
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
class DailymotionIE(InfoExtractor):
	"""Information Extractor for Dailymotion

	NOTE(review): this listing is an excerpt — interleaving lines from
	the full file (``try:`` openers, ``return`` statements, ``if mobj is
	None:`` guards, method headers) are omitted, so indentation below is
	approximate.
	"""

	_VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

		# (body of the omitted suitable() staticmethod)
		return (re.match(DailymotionIE._VALID_URL, url) is not None)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

	def _real_initialize(self):
		# No per-extractor initialization needed (body omitted in excerpt)

	def _real_extract(self, url):
		"""Extract one Dailymotion video and hand it to the downloader."""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		simple_title = mobj.group(2).decode('utf-8')
		video_extension = 'flv'

		# Retrieve video webpage to extract further information
		request = urllib2.Request(url)
		self.report_download_webpage(video_id)
		webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
		self._downloader.trouble(u'ERROR: unable to extract media URL')
		mediaURL = urllib.unquote(mobj.group(1))

		# if needed add http://www.dailymotion.com/ if relative URL

		video_url = mediaURL

		# '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
		mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
		self._downloader.trouble(u'ERROR: unable to extract title')
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)

		mobj = re.search(r'(?im)<Attribute name="owner">(.+?)</Attribute>', webpage)
		self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
		video_uploader = mobj.group(1)

		# Process video information
		self._downloader.process_info({
			'id':		video_id.decode('utf-8'),
			'url':		video_url.decode('utf-8'),
			'uploader':	video_uploader.decode('utf-8'),
			'upload_date':	u'NA',
			'title':	video_title,
			'stitle':	simple_title,
			'ext':		video_extension.decode('utf-8'),
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
class GoogleIE(InfoExtractor):
	"""Information extractor for video.google.com.

	NOTE(review): this listing is an excerpt — interleaving lines from
	the full file (``try:`` openers, ``return`` statements, ``if mobj is
	None:`` guards, method headers) are omitted, so indentation below is
	approximate.
	"""

	_VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

		# (body of the omitted suitable() staticmethod)
		return (re.match(GoogleIE._VALID_URL, url) is not None)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

	def _real_initialize(self):
		# No per-extractor initialization needed (body omitted in excerpt)

	def _real_extract(self, url):
		"""Extract one Google Video entry and hand it to the downloader."""
		# Extract id from URL
		mobj = re.match(self._VALID_URL, url)
		self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		video_extension = 'mp4'

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
		self.report_download_webpage(video_id)
		webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

		# Extract URL, uploader, and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r"download_url:'([^']+)'", webpage)
		# fallback: no mp4 download link, use the escaped flv stream URL
		video_extension = 'flv'
		mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
		self._downloader.trouble(u'ERROR: unable to extract media URL')
		mediaURL = urllib.unquote(mobj.group(1))
		# un-escape the '\xNN' sequences ('=' and '&')
		mediaURL = mediaURL.replace('\\x3d', '\x3d')
		mediaURL = mediaURL.replace('\\x26', '\x26')

		video_url = mediaURL

		mobj = re.search(r'<title>(.*)</title>', webpage)
		self._downloader.trouble(u'ERROR: unable to extract title')
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

		# Extract video description
		mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
		self._downloader.trouble(u'ERROR: unable to extract video description')
		video_description = mobj.group(1).decode('utf-8')
		if not video_description:
			video_description = 'No description available.'

		# Extract video thumbnail
		if self._downloader.params.get('forcethumbnail', False):
			request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
			webpage = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
			mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
			self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
			video_thumbnail = mobj.group(1)
		else:	# we need something to pass to process_info
			video_thumbnail = ''

		# Process video information
		self._downloader.process_info({
			'id':		video_id.decode('utf-8'),
			'url':		video_url.decode('utf-8'),
			'upload_date':	u'NA',
			'title':	video_title,
			'stitle':	simple_title,
			'ext':		video_extension.decode('utf-8'),
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
class PhotobucketIE(InfoExtractor):
	"""Information extractor for photobucket.com.

	NOTE(review): this listing is an excerpt — interleaving lines from
	the full file (``try:`` openers, ``return`` statements, ``if mobj is
	None:`` guards, method headers) are omitted, so indentation below is
	approximate.
	"""

	_VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

		# (body of the omitted suitable() staticmethod)
		return (re.match(PhotobucketIE._VALID_URL, url) is not None)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

	def _real_initialize(self):
		# No per-extractor initialization needed (body omitted in excerpt)

	def _real_extract(self, url):
		"""Extract one Photobucket flv and hand it to the downloader."""
		# Extract id from URL
		mobj = re.match(self._VALID_URL, url)
		self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		video_extension = 'flv'

		# Retrieve video webpage to extract further information
		request = urllib2.Request(url)
		self.report_download_webpage(video_id)
		webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

		# Extract URL, uploader, and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
		self._downloader.trouble(u'ERROR: unable to extract media URL')
		mediaURL = urllib.unquote(mobj.group(1))

		video_url = mediaURL

		# Title regex also captures the uploader in group(2)
		mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
		self._downloader.trouble(u'ERROR: unable to extract title')
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

		video_uploader = mobj.group(2).decode('utf-8')

		# Process video information
		self._downloader.process_info({
			'id':		video_id.decode('utf-8'),
			'url':		video_url.decode('utf-8'),
			'uploader':	video_uploader,
			'upload_date':	u'NA',
			'title':	video_title,
			'stitle':	simple_title,
			'ext':		video_extension.decode('utf-8'),
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
class YahooIE(InfoExtractor):
	"""Information extractor for video.yahoo.com.

	NOTE(review): this listing is an excerpt — interleaving lines from
	the full file (``try:`` openers, ``return`` statements, ``if mobj is
	None:`` guards, method headers) are omitted, so indentation below is
	approximate.
	"""

	# _VALID_URL matches all Yahoo! Video URLs
	# _VPAGE_URL matches only the extractable '/watch/' URLs
	_VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
	_VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

		# (body of the omitted suitable() staticmethod)
		return (re.match(YahooIE._VALID_URL, url) is not None)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

	def _real_initialize(self):
		# No per-extractor initialization needed (body omitted in excerpt)

	def _real_extract(self, url, new_video=True):
		"""Extract one Yahoo! video; non-/watch/ URLs are rewritten and
		re-extracted recursively with new_video=False."""
		# Extract ID from URL
		mobj = re.match(self._VALID_URL, url)
		self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(2)
		video_extension = 'flv'

		# Rewrite valid but non-extractable URLs as
		# extractable English language /watch/ URLs
		if re.match(self._VPAGE_URL, url) is None:
			request = urllib2.Request(url)
			webpage = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

			mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
			self._downloader.trouble(u'ERROR: Unable to extract id field')
			yahoo_id = mobj.group(1)

			mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
			self._downloader.trouble(u'ERROR: Unable to extract vid field')
			yahoo_vid = mobj.group(1)

			url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
			return self._real_extract(url, new_video=False)

		# Retrieve video webpage to extract further information
		request = urllib2.Request(url)
		self.report_download_webpage(video_id)
		webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

		# Extract uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
		self._downloader.trouble(u'ERROR: unable to extract video title')
		video_title = mobj.group(1).decode('utf-8')
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

		mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
		self._downloader.trouble(u'ERROR: unable to extract video uploader')
		# NOTE(review): group(1) is the (people|profile) alternation, not the
		# display name — group(2) looks like the intended capture; verify.
		video_uploader = mobj.group(1).decode('utf-8')

		# Extract video thumbnail
		mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
		self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
		video_thumbnail = mobj.group(1).decode('utf-8')

		# Extract video description
		mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
		self._downloader.trouble(u'ERROR: unable to extract video description')
		video_description = mobj.group(1).decode('utf-8')
		if not video_description: video_description = 'No description available.'

		# Extract video height and width
		mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
		self._downloader.trouble(u'ERROR: unable to extract video height')
		yv_video_height = mobj.group(1)

		mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
		self._downloader.trouble(u'ERROR: unable to extract video width')
		yv_video_width = mobj.group(1)

		# Retrieve video playlist to extract media URL
		# I'm not completely sure what all these options are, but we
		# seem to need most of them, otherwise the server sends a 401.
		yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
		yv_bitrate = '700'  # according to Wikipedia this is hard-coded
		request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
				'&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
				'&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
		self.report_download_webpage(video_id)
		webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))

		# Extract media URL from playlist XML
		mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
		self._downloader.trouble(u'ERROR: Unable to extract media URL')
		video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
		video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)

		# Process video information
		# NOTE(review): 'thumbnail' and 'description' appear twice in this
		# dict literal; the later (un-decoded) entries win in Python.
		self._downloader.process_info({
			'id':		video_id.decode('utf-8'),
			'uploader':	video_uploader,
			'upload_date':	u'NA',
			'title':	video_title,
			'stitle':	simple_title,
			'ext':		video_extension.decode('utf-8'),
			'thumbnail':	video_thumbnail.decode('utf-8'),
			'description':	video_description,
			'thumbnail':	video_thumbnail,
			'description':	video_description,
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
class GenericIE(InfoExtractor):
	"""Generic last-resort information extractor.

	Tried when no site-specific extractor claims the URL: scrapes the
	page for a JW-Player/SWFObject style 'file='/'source=' media URL.

	NOTE(review): this listing is an excerpt — interleaving lines from
	the full file (``try:`` openers, ``return`` statements, ``if mobj is
	None:`` guards, method headers) are omitted, so indentation below is
	approximate.
	"""

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
		self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

	def _real_initialize(self):
		# No per-extractor initialization needed (body omitted in excerpt)

	def _real_extract(self, url):
		"""Best-effort extraction of a media URL from an arbitrary page."""
		# At this point we have a new video
		self._downloader.increment_downloads()

		video_id = url.split('/')[-1]
		request = urllib2.Request(url)
		self.report_download_webpage(video_id)
		webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
		except ValueError, err:
			# since this is the last-resort InfoExtractor, if
			# this error is thrown, it'll be thrown here
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

		self.report_extraction(video_id)
		# Start with something easy: JW Player in SWFObject
		mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
		# Broaden the search a little bit
		mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
		self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

		# It's possible that one of the regexes
		# matched, but returned an empty group:
		if mobj.group(1) is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

		video_url = urllib.unquote(mobj.group(1))
		video_id = os.path.basename(video_url)

		# here's a fun little line of code for you:
		video_extension = os.path.splitext(video_id)[1][1:]
		video_id = os.path.splitext(video_id)[0]

		# it's tempting to parse this further, but you would
		# have to take into account all the variations like
		#   Video Title - Site Name
		#   Site Name | Video Title
		#   Video Title - Tagline | Site Name
		# and so on and so forth; it's just not practical
		mobj = re.search(r'<title>(.*)</title>', webpage)
		self._downloader.trouble(u'ERROR: unable to extract title')
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

		# video uploader is domain name
		mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
		self._downloader.trouble(u'ERROR: unable to extract title')
		video_uploader = mobj.group(1).decode('utf-8')

		# Process video information
		self._downloader.process_info({
			'id':		video_id.decode('utf-8'),
			'url':		video_url.decode('utf-8'),
			'uploader':	video_uploader,
			'upload_date':	u'NA',
			'title':	video_title,
			'stitle':	simple_title,
			'ext':		video_extension.decode('utf-8'),
		except UnavailableVideoError, err:
			self._downloader.trouble(u'\nERROR: unable to download video')
class YoutubeSearchIE(InfoExtractor):
	"""Information Extractor for YouTube search queries.

	Handles 'ytsearch[N|all]:query' pseudo-URLs and delegates each hit
	to the wrapped YoutubeIE.

	NOTE(review): this listing is an excerpt — interleaving lines from
	the full file (``try:`` openers, ``return``/``break`` statements,
	``if`` guards, method headers) are omitted, so indentation below is
	approximate.
	"""
	_VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
	_TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
	_VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
	_MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
	_max_youtube_results = 1000

	def __init__(self, youtube_ie, downloader=None):
		InfoExtractor.__init__(self, downloader)
		# Each search result is extracted through this YoutubeIE
		self._youtube_ie = youtube_ie

		# (body of the omitted suitable() staticmethod)
		return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)

	def report_download_page(self, query, pagenum):
		"""Report attempt to download playlist page with given number."""
		query = query.decode(preferredencoding())
		self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

	def _real_initialize(self):
		self._youtube_ie.initialize()

	def _real_extract(self, query):
		"""Parse the ytsearch prefix and fetch the requested result count."""
		mobj = re.match(self._VALID_QUERY, query)
		self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

		prefix, query = query.split(':')
		query = query.encode('utf-8')
		# bare 'ytsearch:' means a single (best) result
		self._download_n_results(query, 1)
		elif prefix == 'all':
			self._download_n_results(query, self._max_youtube_results)
			self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
			elif n > self._max_youtube_results:
				self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
				n = self._max_youtube_results
			self._download_n_results(query, n)
		except ValueError: # parsing prefix as integer fails
			self._download_n_results(query, 1)

	def _download_n_results(self, query, n):
		"""Downloads a specified number of results for a query"""

		already_seen = set()

		self.report_download_page(query, pagenum)
		result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
		request = urllib2.Request(result_url)
		page = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

		# Extract video identifiers
		for mobj in re.finditer(self._VIDEO_INDICATOR, page):
			# slice the id out of href="/watch?v=...":
			video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
			if video_id not in already_seen:
				video_ids.append(video_id)
				already_seen.add(video_id)
			if len(video_ids) == n:
				# Specified n videos reached
				for id in video_ids:
					self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)

		# No "Next" link: last results page — flush what we collected
		if re.search(self._MORE_PAGES_INDICATOR, page) is None:
			for id in video_ids:
				self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)

		pagenum = pagenum + 1
class GoogleSearchIE(InfoExtractor):
	"""Information Extractor for Google Video search queries.

	Handles 'gvsearch[N|all]:query' pseudo-URLs and delegates each hit
	to the wrapped GoogleIE.

	NOTE(review): this listing is an excerpt — interleaving lines from
	the full file (``try:`` openers, ``return``/``break`` statements,
	``if`` guards, method headers) are omitted, so indentation below is
	approximate.
	"""
	_VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
	_TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
	_VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
	_MORE_PAGES_INDICATOR = r'<span>Next</span>'
	_max_google_results = 1000

	def __init__(self, google_ie, downloader=None):
		InfoExtractor.__init__(self, downloader)
		# Each search result is extracted through this GoogleIE
		self._google_ie = google_ie

		# (body of the omitted suitable() staticmethod)
		return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)

	def report_download_page(self, query, pagenum):
		"""Report attempt to download playlist page with given number."""
		query = query.decode(preferredencoding())
		self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

	def _real_initialize(self):
		self._google_ie.initialize()

	def _real_extract(self, query):
		"""Parse the gvsearch prefix and fetch the requested result count."""
		mobj = re.match(self._VALID_QUERY, query)
		self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

		prefix, query = query.split(':')
		query = query.encode('utf-8')
		# bare 'gvsearch:' means a single (best) result
		self._download_n_results(query, 1)
		elif prefix == 'all':
			self._download_n_results(query, self._max_google_results)
			self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
			elif n > self._max_google_results:
				self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
				n = self._max_google_results
			self._download_n_results(query, n)
		except ValueError: # parsing prefix as integer fails
			self._download_n_results(query, 1)

	def _download_n_results(self, query, n):
		"""Downloads a specified number of results for a query"""

		already_seen = set()

		self.report_download_page(query, pagenum)
		result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
		request = urllib2.Request(result_url)
		page = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

		# Extract video identifiers
		for mobj in re.finditer(self._VIDEO_INDICATOR, page):
			video_id = mobj.group(1)
			if video_id not in already_seen:
				video_ids.append(video_id)
				already_seen.add(video_id)
			if len(video_ids) == n:
				# Specified n videos reached
				for id in video_ids:
					self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)

		# No "Next" link: last results page — flush what we collected
		if re.search(self._MORE_PAGES_INDICATOR, page) is None:
			for id in video_ids:
				self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)

		pagenum = pagenum + 1
2007 class YahooSearchIE(InfoExtractor):
2008 """Information Extractor for Yahoo! Video search queries."""
2009 _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
2010 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2011 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2012 _MORE_PAGES_INDICATOR = r'\s*Next'
2014 _max_yahoo_results = 1000
2016 def __init__(self, yahoo_ie, downloader=None):
2017 InfoExtractor.__init__(self, downloader)
2018 self._yahoo_ie = yahoo_ie
2022 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
2024 def report_download_page(self, query, pagenum):
2025 """Report attempt to download playlist page with given number."""
2026 query = query.decode(preferredencoding())
2027 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2029 def _real_initialize(self):
2030 self._yahoo_ie.initialize()
2032 def _real_extract(self, query):
2033 mobj = re.match(self._VALID_QUERY, query)
2035 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2038 prefix, query = query.split(':')
2040 query = query.encode('utf-8')
2042 self._download_n_results(query, 1)
2044 elif prefix == 'all':
2045 self._download_n_results(query, self._max_yahoo_results)
2051 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2053 elif n > self._max_yahoo_results:
2054 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2055 n = self._max_yahoo_results
2056 self._download_n_results(query, n)
2058 except ValueError: # parsing prefix as integer fails
2059 self._download_n_results(query, 1)
2062 def _download_n_results(self, query, n):
2063 """Downloads a specified number of results for a query"""
2066 already_seen = set()
2070 self.report_download_page(query, pagenum)
2071 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2072 request = urllib2.Request(result_url)
2074 page = urllib2.urlopen(request).read()
2075 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2076 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2079 # Extract video identifiers
2080 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2081 video_id = mobj.group(1)
2082 if video_id not in already_seen:
2083 video_ids.append(video_id)
2084 already_seen.add(video_id)
2085 if len(video_ids) == n:
2086 # Specified n videos reached
2087 for id in video_ids:
2088 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2091 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2092 for id in video_ids:
2093 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2096 pagenum = pagenum + 1
2098 class YoutubePlaylistIE(InfoExtractor):
2099 """Information Extractor for YouTube playlists."""
2101 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist)\?.*?(p|a)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2102 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2103 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2104 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2107 def __init__(self, youtube_ie, downloader=None):
2108 InfoExtractor.__init__(self, downloader)
2109 self._youtube_ie = youtube_ie
2113 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
2115 def report_download_page(self, playlist_id, pagenum):
2116 """Report attempt to download playlist page with given number."""
2117 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2119 def _real_initialize(self):
2120 self._youtube_ie.initialize()
2122 def _real_extract(self, url):
2123 # Extract playlist id
2124 mobj = re.match(self._VALID_URL, url)
2126 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2130 if mobj.group(3) is not None:
2131 self._youtube_ie.extract(mobj.group(3))
2134 # Download playlist pages
2135 # prefix is 'p' as default for playlists but there are other types that need extra care
2136 playlist_prefix = mobj.group(1)
2137 if playlist_prefix == 'a':
2138 playlist_access = 'artist'
2140 playlist_prefix = 'p'
2141 playlist_access = 'view_play_list'
2142 playlist_id = mobj.group(2)
2147 self.report_download_page(playlist_id, pagenum)
2148 request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2150 page = urllib2.urlopen(request).read()
2151 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2152 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2155 # Extract video identifiers
2157 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2158 if mobj.group(1) not in ids_in_page:
2159 ids_in_page.append(mobj.group(1))
2160 video_ids.extend(ids_in_page)
2162 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2164 pagenum = pagenum + 1
2166 playliststart = self._downloader.params.get('playliststart', 1) - 1
2167 playlistend = self._downloader.params.get('playlistend', -1)
2168 video_ids = video_ids[playliststart:playlistend]
2170 for id in video_ids:
2171 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2174 class YoutubeUserIE(InfoExtractor):
2175 """Information Extractor for YouTube users."""
2177 _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2178 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2179 _GDATA_PAGE_SIZE = 50
2180 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2181 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2184 def __init__(self, youtube_ie, downloader=None):
2185 InfoExtractor.__init__(self, downloader)
2186 self._youtube_ie = youtube_ie
2190 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2192 def report_download_page(self, username, start_index):
2193 """Report attempt to download user page."""
2194 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2195 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2197 def _real_initialize(self):
2198 self._youtube_ie.initialize()
2200 def _real_extract(self, url):
2202 mobj = re.match(self._VALID_URL, url)
2204 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2207 username = mobj.group(1)
2209 # Download video ids using YouTube Data API. Result size per
2210 # query is limited (currently to 50 videos) so we need to query
2211 # page by page until there are no video ids - it means we got
2218 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2219 self.report_download_page(username, start_index)
2221 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2224 page = urllib2.urlopen(request).read()
2225 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2226 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2229 # Extract video identifiers
2232 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2233 if mobj.group(1) not in ids_in_page:
2234 ids_in_page.append(mobj.group(1))
2236 video_ids.extend(ids_in_page)
2238 # A little optimization - if current page is not
2239 # "full", ie. does not contain PAGE_SIZE video ids then
2240 # we can assume that this page is the last one - there
2241 # are no more ids on further pages - no need to query
2244 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2249 all_ids_count = len(video_ids)
2250 playliststart = self._downloader.params.get('playliststart', 1) - 1
2251 playlistend = self._downloader.params.get('playlistend', -1)
2253 if playlistend == -1:
2254 video_ids = video_ids[playliststart:]
2256 video_ids = video_ids[playliststart:playlistend]
2258 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2259 (username, all_ids_count, len(video_ids)))
2261 for video_id in video_ids:
2262 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2265 class DepositFilesIE(InfoExtractor):
2266 """Information extractor for depositfiles.com"""
2268 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2270 def __init__(self, downloader=None):
2271 InfoExtractor.__init__(self, downloader)
2275 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2277 def report_download_webpage(self, file_id):
2278 """Report webpage download."""
2279 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2281 def report_extraction(self, file_id):
2282 """Report information extraction."""
2283 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2285 def _real_initialize(self):
2288 def _real_extract(self, url):
2289 # At this point we have a new file
2290 self._downloader.increment_downloads()
2292 file_id = url.split('/')[-1]
2293 # Rebuild url in english locale
2294 url = 'http://depositfiles.com/en/files/' + file_id
2296 # Retrieve file webpage with 'Free download' button pressed
2297 free_download_indication = { 'gateway_result' : '1' }
2298 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2300 self.report_download_webpage(file_id)
2301 webpage = urllib2.urlopen(request).read()
2302 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2303 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2306 # Search for the real file URL
2307 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2308 if (mobj is None) or (mobj.group(1) is None):
2309 # Try to figure out reason of the error.
2310 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2311 if (mobj is not None) and (mobj.group(1) is not None):
2312 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2313 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2315 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2318 file_url = mobj.group(1)
2319 file_extension = os.path.splitext(file_url)[1][1:]
2321 # Search for file title
2322 mobj = re.search(r'<b title="(.*?)">', webpage)
2324 self._downloader.trouble(u'ERROR: unable to extract title')
2326 file_title = mobj.group(1).decode('utf-8')
2329 # Process file information
2330 self._downloader.process_info({
2331 'id': file_id.decode('utf-8'),
2332 'url': file_url.decode('utf-8'),
2334 'upload_date': u'NA',
2335 'title': file_title,
2336 'stitle': file_title,
2337 'ext': file_extension.decode('utf-8'),
2341 except UnavailableVideoError, err:
2342 self._downloader.trouble(u'ERROR: unable to download file')
2344 class FacebookIE(InfoExtractor):
2345 """Information Extractor for Facebook"""
2347 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2348 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2349 _NETRC_MACHINE = 'facebook'
2350 _available_formats = ['highqual', 'lowqual']
2351 _video_extensions = {
2356 def __init__(self, downloader=None):
2357 InfoExtractor.__init__(self, downloader)
2361 return (re.match(FacebookIE._VALID_URL, url) is not None)
2363 def _reporter(self, message):
2364 """Add header and report message."""
2365 self._downloader.to_screen(u'[facebook] %s' % message)
2367 def report_login(self):
2368 """Report attempt to log in."""
2369 self._reporter(u'Logging in')
2371 def report_video_webpage_download(self, video_id):
2372 """Report attempt to download video webpage."""
2373 self._reporter(u'%s: Downloading video webpage' % video_id)
2375 def report_information_extraction(self, video_id):
2376 """Report attempt to extract video information."""
2377 self._reporter(u'%s: Extracting video information' % video_id)
2379 def _parse_page(self, video_webpage):
2380 """Extract video information from page"""
2382 data = {'title': r'class="video_title datawrap">(.*?)</',
2383 'description': r'<div class="datawrap">(.*?)</div>',
2384 'owner': r'\("video_owner_name", "(.*?)"\)',
2385 'upload_date': r'data-date="(.*?)"',
2386 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2389 for piece in data.keys():
2390 mobj = re.search(data[piece], video_webpage)
2391 if mobj is not None:
2392 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2396 for fmt in self._available_formats:
2397 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2398 if mobj is not None:
2399 # URL is in a Javascript segment inside an escaped Unicode format within
2400 # the generally utf-8 page
2401 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2402 video_info['video_urls'] = video_urls
2406 def _real_initialize(self):
2407 if self._downloader is None:
2412 downloader_params = self._downloader.params
2414 # Attempt to use provided username and password or .netrc data
2415 if downloader_params.get('username', None) is not None:
2416 useremail = downloader_params['username']
2417 password = downloader_params['password']
2418 elif downloader_params.get('usenetrc', False):
2420 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2421 if info is not None:
2425 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2426 except (IOError, netrc.NetrcParseError), err:
2427 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2430 if useremail is None:
2439 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2442 login_results = urllib2.urlopen(request).read()
2443 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2444 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2446 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2447 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2450 def _real_extract(self, url):
2451 mobj = re.match(self._VALID_URL, url)
2453 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2455 video_id = mobj.group('ID')
2458 self.report_video_webpage_download(video_id)
2459 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2461 page = urllib2.urlopen(request)
2462 video_webpage = page.read()
2463 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2464 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2467 # Start extracting information
2468 self.report_information_extraction(video_id)
2470 # Extract information
2471 video_info = self._parse_page(video_webpage)
2474 if 'owner' not in video_info:
2475 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2477 video_uploader = video_info['owner']
2480 if 'title' not in video_info:
2481 self._downloader.trouble(u'ERROR: unable to extract video title')
2483 video_title = video_info['title']
2484 video_title = video_title.decode('utf-8')
2485 video_title = sanitize_title(video_title)
2488 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2489 simple_title = simple_title.strip(ur'_')
2492 if 'thumbnail' not in video_info:
2493 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2494 video_thumbnail = ''
2496 video_thumbnail = video_info['thumbnail']
2500 if 'upload_date' in video_info:
2501 upload_time = video_info['upload_date']
2502 timetuple = email.utils.parsedate_tz(upload_time)
2503 if timetuple is not None:
2505 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2510 video_description = 'No description available.'
2511 if (self._downloader.params.get('forcedescription', False) and
2512 'description' in video_info):
2513 video_description = video_info['description']
2515 url_map = video_info['video_urls']
2516 if len(url_map.keys()) > 0:
2517 # Decide which formats to download
2518 req_format = self._downloader.params.get('format', None)
2519 format_limit = self._downloader.params.get('format_limit', None)
2521 if format_limit is not None and format_limit in self._available_formats:
2522 format_list = self._available_formats[self._available_formats.index(format_limit):]
2524 format_list = self._available_formats
2525 existing_formats = [x for x in format_list if x in url_map]
2526 if len(existing_formats) == 0:
2527 self._downloader.trouble(u'ERROR: no known formats available for video')
2529 if req_format is None:
2530 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2531 elif req_format == '-1':
2532 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2535 if req_format not in url_map:
2536 self._downloader.trouble(u'ERROR: requested format not available')
2538 video_url_list = [(req_format, url_map[req_format])] # Specific format
2540 for format_param, video_real_url in video_url_list:
2542 # At this point we have a new video
2543 self._downloader.increment_downloads()
2546 video_extension = self._video_extensions.get(format_param, 'mp4')
2548 # Find the video URL in fmt_url_map or conn paramters
2550 # Process video information
2551 self._downloader.process_info({
2552 'id': video_id.decode('utf-8'),
2553 'url': video_real_url.decode('utf-8'),
2554 'uploader': video_uploader.decode('utf-8'),
2555 'upload_date': upload_date,
2556 'title': video_title,
2557 'stitle': simple_title,
2558 'ext': video_extension.decode('utf-8'),
2559 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2560 'thumbnail': video_thumbnail.decode('utf-8'),
2561 'description': video_description.decode('utf-8'),
2564 except UnavailableVideoError, err:
2565 self._downloader.trouble(u'\nERROR: unable to download video')
2567 class BlipTVIE(InfoExtractor):
2568 """Information extractor for blip.tv"""
2570 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip.tv/(.+)$'
2571 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2575 return (re.match(BlipTVIE._VALID_URL, url) is not None)
2577 def report_download_webpage(self, file_id):
2578 """Report webpage download."""
2579 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.service_name, file_id))
2581 def report_extraction(self, file_id):
2582 """Report information extraction."""
2583 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.service_name, file_id))
2586 def service_name(self):
2589 def _simplify_title(self, title):
2590 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
2591 res = res.strip(ur'_')
2594 def _real_extract(self, url):
2595 mobj = re.match(self._VALID_URL, url)
2597 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2600 json_url = url + ('&' if '?' in url else '?') + 'skin=json&version=2&no_wrap=1'
2601 request = urllib2.Request(json_url)
2603 json_code = urllib2.urlopen(request).read()
2604 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2605 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2608 json_data = json.loads(json_code)
2609 data = json_data['Post']
2611 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2612 video_url = data['media']['url']
2613 umobj = re.match(self._URL_EXT, video_url)
2615 raise ValueError('Can not determine filename extension')
2616 ext = umobj.group(1)
2619 'id': data['item_id'],
2621 'uploader': data['display_name'],
2622 'upload_date': upload_date,
2623 'title': data['title'],
2624 'stitle': self._simplify_title(data['title']),
2626 'format': data['media']['mimeType'],
2627 'thumbnail': data['thumbnailUrl'],
2628 'description': data['description'],
2629 'player_url': data['embedUrl']
2631 except (ValueError,KeyError), err:
2632 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % str(err))
2636 self._downloader.process_info(info)
2637 except UnavailableVideoError, err:
2638 self._downloader.trouble(u'\nERROR: unable to download video')
class PostProcessor(object):
    """Post Processor class.

    PostProcessor objects can be added to downloaders with their
    add_post_processor() method. When the downloader has finished a
    successful download, it will take its internal chain of PostProcessors
    and start calling the run() method on each one of them, first with
    an initial argument and then with the returned value of the previous
    PostProcessor.

    The chain will be stopped if one of them ever returns None or the end
    of the chain is reached.

    PostProcessor objects follow a "mutual registration" process similar
    to InfoExtractor objects.
    """

    _downloader = None

    def __init__(self, downloader=None):
        self._downloader = downloader

    def set_downloader(self, downloader):
        """Sets the downloader for this PP."""
        self._downloader = downloader

    def run(self, information):
        """Run the PostProcessor.

        The "information" argument is a dictionary like the ones
        composed by InfoExtractors. The only difference is that this
        one has an extra field called "filepath" that points to the
        downloaded file.

        When this method returns None, the postprocessing chain is
        stopped. However, this method may return an information
        dictionary that will be passed to the next postprocessing
        object in the chain. It can be the one it received after
        changing some fields.

        In addition, this method may raise a PostProcessingError
        exception that will be taken into account by the downloader
        it was called from.
        """
        return information # by default, do nothing
class FFmpegExtractAudioPP(PostProcessor):
    """Post-processor that turns a downloaded video into an audio-only
    file by driving the external ffprobe/ffmpeg binaries."""

    def __init__(self, downloader=None, preferredcodec=None):
        PostProcessor.__init__(self, downloader)
        if preferredcodec is None:
            preferredcodec = 'best'
        self._preferredcodec = preferredcodec

    @staticmethod
    def get_audio_codec(path):
        # Probe the file with ffprobe and return the codec name of its
        # audio stream, or None when probing fails.
        try:
            cmd = ['ffprobe', '-show_streams', '--', path]
            handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
            output = handle.communicate()[0]
            if handle.wait() != 0:
                return None
        except (IOError, OSError):
            return None
        audio_codec = None
        for line in output.split('\n'):
            if line.startswith('codec_name='):
                audio_codec = line.split('=')[1].strip()
            elif line.strip() == 'codec_type=audio' and audio_codec is not None:
                return audio_codec
        return None

    @staticmethod
    def run_ffmpeg(path, out_path, codec, more_opts):
        # Returns True when the conversion succeeded.
        try:
            cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
            ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
            return (ret == 0)
        except (IOError, OSError):
            return False

    def run(self, information):
        path = information['filepath']

        filecodec = self.get_audio_codec(path)
        if filecodec is None:
            self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
            return None

        more_opts = []
        if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
            if filecodec == 'aac' or filecodec == 'mp3':
                # Lossless if possible
                acodec = 'copy'
                extension = filecodec
                if filecodec == 'aac':
                    more_opts = ['-f', 'adts']
            else:
                # MP3 otherwise.
                acodec = 'libmp3lame'
                extension = 'mp3'
                more_opts = ['-ab', '128k']
        else:
            # We convert the audio (lossy)
            acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
            extension = self._preferredcodec
            more_opts = ['-ab', '128k']
            if self._preferredcodec == 'aac':
                more_opts += ['-f', 'adts']

        (prefix, ext) = os.path.splitext(path)
        new_path = prefix + '.' + extension
        self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
        status = self.run_ffmpeg(path, new_path, acodec, more_opts)

        if not status:
            self._downloader.to_stderr(u'WARNING: error running ffmpeg')
            return None

        try:
            os.remove(path)
        except (IOError, OSError):
            self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
            return None

        information['filepath'] = new_path
        return information
2769 ### MAIN PROGRAM ###
2770 if __name__ == '__main__':
2772 # Modules needed only when running the main program
2776 # Function to update the program file with the latest version from the repository.
2777 def update_self(downloader, filename):
2778 # Note: downloader only used for options
2779 if not os.access(filename, os.W_OK):
2780 sys.exit('ERROR: no write permissions on %s' % filename)
2782 downloader.to_screen('Updating to latest stable version...')
2784 latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
2785 latest_version = urllib.urlopen(latest_url).read().strip()
2786 prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
2787 newcontent = urllib.urlopen(prog_url).read()
2788 except (IOError, OSError), err:
2789 sys.exit('ERROR: unable to download latest version')
2791 stream = open(filename, 'w')
2792 stream.write(newcontent)
2794 except (IOError, OSError), err:
2795 sys.exit('ERROR: unable to overwrite current version')
2796 downloader.to_screen('Updated to version %s' % latest_version)
2798 # Parse command line
2799 parser = optparse.OptionParser(
2800 usage='Usage: %prog [options] url...',
2801 version='2011.03.29',
2802 conflict_handler='resolve',
2805 parser.add_option('-h', '--help',
2806 action='help', help='print this help text and exit')
2807 parser.add_option('-v', '--version',
2808 action='version', help='print program version and exit')
2809 parser.add_option('-U', '--update',
2810 action='store_true', dest='update_self', help='update this program to latest stable version')
2811 parser.add_option('-i', '--ignore-errors',
2812 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
2813 parser.add_option('-r', '--rate-limit',
2814 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
2815 parser.add_option('-R', '--retries',
2816 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
2817 parser.add_option('--playlist-start',
2818 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
2819 parser.add_option('--playlist-end',
2820 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
2821 parser.add_option('--dump-user-agent',
2822 action='store_true', dest='dump_user_agent',
2823 help='display the current browser identification', default=False)
2825 authentication = optparse.OptionGroup(parser, 'Authentication Options')
2826 authentication.add_option('-u', '--username',
2827 dest='username', metavar='USERNAME', help='account username')
2828 authentication.add_option('-p', '--password',
2829 dest='password', metavar='PASSWORD', help='account password')
2830 authentication.add_option('-n', '--netrc',
2831 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
2832 parser.add_option_group(authentication)
2834 video_format = optparse.OptionGroup(parser, 'Video Format Options')
2835 video_format.add_option('-f', '--format',
2836 action='store', dest='format', metavar='FORMAT', help='video format code')
2837 video_format.add_option('--all-formats',
2838 action='store_const', dest='format', help='download all available video formats', const='-1')
2839 video_format.add_option('--max-quality',
2840 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
2841 parser.add_option_group(video_format)
2843 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
2844 verbosity.add_option('-q', '--quiet',
2845 action='store_true', dest='quiet', help='activates quiet mode', default=False)
2846 verbosity.add_option('-s', '--simulate',
2847 action='store_true', dest='simulate', help='do not download video', default=False)
2848 verbosity.add_option('-g', '--get-url',
2849 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
2850 verbosity.add_option('-e', '--get-title',
2851 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
2852 verbosity.add_option('--get-thumbnail',
2853 action='store_true', dest='getthumbnail',
2854 help='simulate, quiet but print thumbnail URL', default=False)
2855 verbosity.add_option('--get-description',
2856 action='store_true', dest='getdescription',
2857 help='simulate, quiet but print video description', default=False)
2858 verbosity.add_option('--get-filename',
2859 action='store_true', dest='getfilename',
2860 help='simulate, quiet but print output filename', default=False)
2861 verbosity.add_option('--no-progress',
2862 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
2863 verbosity.add_option('--console-title',
2864 action='store_true', dest='consoletitle',
2865 help='display progress in console titlebar', default=False)
2866 parser.add_option_group(verbosity)
2868 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
2869 filesystem.add_option('-t', '--title',
2870 action='store_true', dest='usetitle', help='use title in file name', default=False)
2871 filesystem.add_option('-l', '--literal',
2872 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
2873 filesystem.add_option('-A', '--auto-number',
2874 action='store_true', dest='autonumber',
2875 help='number downloaded files starting from 00000', default=False)
2876 filesystem.add_option('-o', '--output',
2877 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
2878 filesystem.add_option('-a', '--batch-file',
2879 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
2880 filesystem.add_option('-w', '--no-overwrites',
2881 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
2882 filesystem.add_option('-c', '--continue',
2883 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
2884 filesystem.add_option('--cookies',
2885 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
2886 filesystem.add_option('--no-part',
2887 action='store_true', dest='nopart', help='do not use .part files', default=False)
2888 filesystem.add_option('--no-mtime',
2889 action='store_false', dest='updatetime',
2890 help='do not use the Last-modified header to set the file modification time', default=True)
2891 parser.add_option_group(filesystem)
2893 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
2894 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
2895 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
2896 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
2897 help='"best", "aac" or "mp3"; best by default')
2898 parser.add_option_group(postproc)
2900 (opts, args) = parser.parse_args()
2902 # Open appropriate CookieJar
2903 if opts.cookiefile is None:
2904 jar = cookielib.CookieJar()
2907 jar = cookielib.MozillaCookieJar(opts.cookiefile)
2908 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
2910 except (IOError, OSError), err:
2911 sys.exit(u'ERROR: unable to open cookie file')
2914 if opts.dump_user_agent:
2915 print std_headers['User-Agent']
2918 # General configuration
2919 cookie_processor = urllib2.HTTPCookieProcessor(jar)
2920 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()))
2921 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
2923 # Batch file verification
2925 if opts.batchfile is not None:
2927 if opts.batchfile == '-':
2930 batchfd = open(opts.batchfile, 'r')
2931 batchurls = batchfd.readlines()
2932 batchurls = [x.strip() for x in batchurls]
2933 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
2935 sys.exit(u'ERROR: batch file could not be read')
2936 all_urls = batchurls + args
# Conflicting, missing and erroneous options
# parser.error() prints the message and exits, so each check is terminal.
if opts.usenetrc and (opts.username is not None or opts.password is not None):
    parser.error(u'using .netrc conflicts with giving username/password')
if opts.password is not None and opts.username is None:
    parser.error(u'account username missing')
if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
    parser.error(u'using output template conflicts with using title, literal title or auto number')
if opts.usetitle and opts.useliteral:
    parser.error(u'using title conflicts with using literal title')
# Username given without a password: prompt for it interactively
# (getpass does not echo the typed characters).
if opts.username is not None and opts.password is None:
    opts.password = getpass.getpass(u'Type account password and press return:')
# Normalize --rate-limit from its string form to a numeric byte count.
# FileDownloader.parse_bytes signals an unparseable value by returning
# None (that is the only failure mode checked here).
if opts.ratelimit is not None:
    numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
    if numeric_limit is None:
        parser.error(u'invalid rate limit specified')
    opts.ratelimit = numeric_limit
2954 if opts.retries is not None:
2956 opts.retries = long(opts.retries)
2957 except (TypeError, ValueError), err:
2958 parser.error(u'invalid retry count specified')
2960 opts.playliststart = long(opts.playliststart)
2961 if opts.playliststart <= 0:
2963 except (TypeError, ValueError), err:
2964 parser.error(u'invalid playlist start number specified')
2966 opts.playlistend = long(opts.playlistend)
2967 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
2969 except (TypeError, ValueError), err:
2970 parser.error(u'invalid playlist end number specified')
# --audio-format is only meaningful together with --extract-audio,
# so it is validated only in that case.
if opts.extractaudio:
    if opts.audioformat not in ['best', 'aac', 'mp3']:
        parser.error(u'invalid audio format specified')
# Information extractors
# One instance per supported site. Several extractors receive another
# extractor as a constructor argument (e.g. the playlist/user/search and
# Metacafe IEs are built around the YouTube IE) — presumably they resolve
# their results through that downstream IE; the exact contract lives in
# the IE classes themselves.
youtube_ie = YoutubeIE()
metacafe_ie = MetacafeIE(youtube_ie)
dailymotion_ie = DailymotionIE()
youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
youtube_user_ie = YoutubeUserIE(youtube_ie)
youtube_search_ie = YoutubeSearchIE(youtube_ie)
google_ie = GoogleIE()
google_search_ie = GoogleSearchIE(google_ie)
photobucket_ie = PhotobucketIE()
yahoo_ie = YahooIE()
yahoo_search_ie = YahooSearchIE(yahoo_ie)
deposit_files_ie = DepositFilesIE()
facebook_ie = FacebookIE()
bliptv_ie = BlipTVIE()
generic_ie = GenericIE()
# Build the single FileDownloader that drives the whole run, translating
# the parsed command-line options into its configuration dict.
fd = FileDownloader({
    'usenetrc': opts.usenetrc,
    'username': opts.username,
    'password': opts.password,
    # Any "print metadata only" switch (-g/-e/--get-thumbnail/...) implies
    # quiet output and, below, simulate mode.
    'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
    'forceurl': opts.geturl,
    'forcetitle': opts.gettitle,
    'forcethumbnail': opts.getthumbnail,
    'forcedescription': opts.getdescription,
    'forcefilename': opts.getfilename,
    'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
    'format': opts.format,
    'format_limit': opts.format_limit,
    # Output template selection: the first truthy clause of this and/or
    # cascade wins. An explicit -o template (decoded from the locale's
    # preferred encoding to unicode) takes precedence; otherwise a default
    # is synthesized from the format/title/autonumber switches; the final
    # fallback is plain '%(id)s.%(ext)s'.
    'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
        or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
        or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
        or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
        or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
        or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
        or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
        or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
        or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
        or u'%(id)s.%(ext)s'),
    'ignoreerrors': opts.ignoreerrors,
    'ratelimit': opts.ratelimit,
    'nooverwrites': opts.nooverwrites,
    'retries': opts.retries,
    'continuedl': opts.continue_dl,
    'noprogress': opts.noprogress,
    'playliststart': opts.playliststart,
    'playlistend': opts.playlistend,
    # '-o -' sends the downloaded data to stdout, so log output must go
    # to stderr instead.
    'logtostderr': opts.outtmpl == '-',
    'consoletitle': opts.consoletitle,
    'nopart': opts.nopart,
    'updatetime': opts.updatetime,
# Register the extractors with the downloader. Registration order is
# significant at least for the generic extractor, which (per the original
# note below) must be last so it only acts as a fallback.
fd.add_info_extractor(youtube_search_ie)
fd.add_info_extractor(youtube_pl_ie)
fd.add_info_extractor(youtube_user_ie)
fd.add_info_extractor(metacafe_ie)
fd.add_info_extractor(dailymotion_ie)
fd.add_info_extractor(youtube_ie)
fd.add_info_extractor(google_ie)
fd.add_info_extractor(google_search_ie)
fd.add_info_extractor(photobucket_ie)
fd.add_info_extractor(yahoo_ie)
fd.add_info_extractor(yahoo_search_ie)
fd.add_info_extractor(deposit_files_ie)
fd.add_info_extractor(facebook_ie)
fd.add_info_extractor(bliptv_ie)

# This must come last since it's the
# fallback if none of the others work
fd.add_info_extractor(generic_ie)
# Wire up audio extraction as a post-processing step when requested;
# opts.audioformat was validated earlier against 'best'/'aac'/'mp3'.
if opts.extractaudio:
    fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))
# -U/--update: replace the running script (sys.argv[0]) with the latest
# release before doing anything else.
if opts.update_self:
    update_self(fd, sys.argv[0])
# At least one URL is required — unless the invocation was an update-only
# run. NOTE(review): only the error branch is visible here; the
# update-only path presumably exits without downloading — confirm against
# the full file.
if len(all_urls) < 1:
    if not opts.update_self:
        parser.error(u'you must provide at least one URL')
# Run the downloads for every collected URL (batch file entries plus
# positional arguments); the returned code is kept, presumably to become
# the process exit status — confirm against the elided tail of main().
retcode = fd.download(all_urls)
3064 # Dump cookie jar if requested
3065 if opts.cookiefile is not None:
3068 except (IOError, OSError), err:
3069 sys.exit(u'ERROR: unable to save cookie jar')
3073 except DownloadError:
3075 except SameFileError:
3076 sys.exit(u'ERROR: fixed output name but more than one file to download')
3077 except KeyboardInterrupt:
3078 sys.exit(u'\nERROR: Interrupted by user')