# -*- coding: utf-8 -*-
# Author: Ricardo Garcia Gonzalez
# Author: Danny Colligan
# Author: Benjamin Johnson
# Author: Vasyl' Vavrychuk
# Author: Witold Baryluk
# Author: Paweł Paprota
# Author: Gergely Imreh
# License: Public domain code
import ctypes
import datetime
import email.utils
import gzip
import htmlentitydefs
import httplib
import locale
import math
import netrc
import os
import os.path
import re
import socket
import string
import StringIO
import subprocess
import sys
import time
import urllib
import urllib2
import zlib

# parse_qs was moved from the cgi module to the urlparse module recently.
try:
    from urlparse import parse_qs
except ImportError:
    from cgi import parse_qs
# Standard headers sent with every HTTP request so that the requests
# look like they come from an ordinary web browser.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
# Characters allowed in "simplified" titles (used to build safe filenames).
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        # Verify the reported encoding actually works; some systems
        # report an unknown/bogus codec name.
        u'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'
    return pref
68 def htmlentity_transform(matchobj):
69 """Transforms an HTML entity to a Unicode character.
71 This function receives a match object and is intended to be used with
72 the re.sub() function.
74 entity = matchobj.group(1)
76 # Known non-numeric HTML entity
77 if entity in htmlentitydefs.name2codepoint:
78 return unichr(htmlentitydefs.name2codepoint[entity])
81 mobj = re.match(ur'(?u)#(x?\d+)', entity)
83 numstr = mobj.group(1)
84 if numstr.startswith(u'x'):
86 numstr = u'0%s' % numstr
89 return unichr(long(numstr, base))
91 # Unknown entity in name, return its literal representation
92 return (u'&%s;' % entity)
95 def sanitize_title(utitle):
96 """Sanitizes a video title so it could be used as part of a filename."""
97 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
98 return utitle.replace(unicode(os.sep), u'%')
101 def sanitize_open(filename, open_mode):
102 """Try to open the given filename, and slightly tweak it if this fails.
104 Attempts to open the given filename. If this fails, it tries to change
105 the filename slightly, step by step, until it's either able to open it
106 or it fails and raises a final exception, like the standard open()
109 It returns the tuple (stream, definitive_file_name).
113 if sys.platform == 'win32':
115 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
116 return (sys.stdout, filename)
117 stream = open(filename, open_mode)
118 return (stream, filename)
119 except (IOError, OSError), err:
120 # In case of error, try to remove win32 forbidden chars
121 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
123 # An exception here should be caught in the caller
124 stream = open(filename, open_mode)
125 return (stream, filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp.

    Returns None when the string cannot be parsed.
    """
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
    pass
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    pass
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded = downloaded
        self.expected = expected
class YoutubeDLHandler(urllib2.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        # Some servers send raw deflate streams (no zlib header); try the
        # raw variant first and fall back to a standard zlib stream.
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        # Newer Pythons accept a `code` argument and provide getcode();
        # emulate that behaviour on older versions.
        if hasattr(urllib2.addinfourl, 'getcode'):
            return urllib2.addinfourl(stream, headers, url, code)
        ret = urllib2.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # Force the standard headers, overriding any caller-supplied ones.
        for h in std_headers:
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, std_headers[h])
        # Honor the opt-out header and strip it before the real request.
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = StringIO.StringIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp
249 class FileDownloader(object):
250 """File Downloader class.
252 File downloader objects are the ones responsible of downloading the
253 actual video file and writing it to disk if the user has requested
254 it, among some other tasks. In most cases there should be one per
255 program. As, given a video URL, the downloader doesn't know how to
256 extract all the needed information, task that InfoExtractors do, it
257 has to pass the URL to one of them.
259 For this, file downloader objects have a method that allows
260 InfoExtractors to be registered in a given order. When it is passed
261 a URL, the file downloader handles it to the first InfoExtractor it
262 finds that reports being able to handle it. The InfoExtractor extracts
263 all the information about the video or videos the URL refers to, and
264 asks the FileDownloader to process the video information, possibly
265 downloading the video.
267 File downloaders accept a lot of parameters. In order not to saturate
268 the object constructor with arguments, it receives a dictionary of
269 options instead. These options are available through the params
270 attribute for the InfoExtractors to use. The FileDownloader also
271 registers itself as the downloader in charge for the InfoExtractors
272 that are added to it, so this is a "mutual registration".
276 username: Username for authentication purposes.
277 password: Password for authentication purposes.
278 usenetrc: Use netrc for authentication instead.
279 quiet: Do not print messages to stdout.
280 forceurl: Force printing final URL.
281 forcetitle: Force printing title.
282 forcethumbnail: Force printing thumbnail URL.
283 forcedescription: Force printing description.
284 forcefilename: Force printing final filename.
285 simulate: Do not download the video files.
286 format: Video format code.
287 format_limit: Highest quality format to try.
288 outtmpl: Template for output names.
289 ignoreerrors: Do not stop on download errors.
290 ratelimit: Download speed limit, in bytes/sec.
291 nooverwrites: Prevent overwriting files.
292 retries: Number of times to retry for HTTP error 5xx
293 continuedl: Try to continue downloads if possible.
294 noprogress: Do not print the progress bar.
295 playliststart: Playlist item to start at.
296 playlistend: Playlist item to end at.
297 logtostderr: Log messages to stderr instead of stdout.
298 consoletitle: Display progress in console window's titlebar.
299 nopart: Do not use temporary .part files.
300 updatetime: Use the Last-modified header to set output file timestamps.
306 _download_retcode = None
307 _num_downloads = None
310 def __init__(self, params):
311 """Create a FileDownloader object with the given options."""
314 self._download_retcode = 0
315 self._num_downloads = 0
316 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
320 def pmkdir(filename):
321 """Create directory components in filename. Similar to Unix "mkdir -p"."""
322 components = filename.split(os.sep)
323 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
324 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
325 for dir in aggregate:
326 if not os.path.exists(dir):
330 def format_bytes(bytes):
333 if type(bytes) is str:
338 exponent = long(math.log(bytes, 1024.0))
339 suffix = 'bkMGTPEZY'[exponent]
340 converted = float(bytes) / float(1024 ** exponent)
341 return '%.2f%s' % (converted, suffix)
344 def calc_percent(byte_counter, data_len):
347 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
350 def calc_eta(start, now, total, current):
354 if current == 0 or dif < 0.001: # One millisecond
356 rate = float(current) / dif
357 eta = long((float(total) - float(current)) / rate)
358 (eta_mins, eta_secs) = divmod(eta, 60)
361 return '%02d:%02d' % (eta_mins, eta_secs)
364 def calc_speed(start, now, bytes):
366 if bytes == 0 or dif < 0.001: # One millisecond
367 return '%10s' % '---b/s'
368 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
371 def best_block_size(elapsed_time, bytes):
372 new_min = max(bytes / 2.0, 1.0)
373 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
374 if elapsed_time < 0.001:
376 rate = bytes / elapsed_time
384 def parse_bytes(bytestr):
385 """Parse a string indicating a byte quantity into a long integer."""
386 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
389 number = float(matchobj.group(1))
390 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
391 return long(round(number * multiplier))
393 def add_info_extractor(self, ie):
394 """Add an InfoExtractor object to the end of the list."""
396 ie.set_downloader(self)
398 def add_post_processor(self, pp):
399 """Add a PostProcessor object to the end of the chain."""
401 pp.set_downloader(self)
403 def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
404 """Print message to stdout if not in quiet mode."""
406 if not self.params.get('quiet', False):
407 terminator = [u'\n', u''][skip_eol]
408 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
409 self._screen_file.flush()
410 except (UnicodeEncodeError), err:
411 if not ignore_encoding_errors:
414 def to_stderr(self, message):
415 """Print message to stderr."""
416 print >>sys.stderr, message.encode(preferredencoding())
418 def to_cons_title(self, message):
419 """Set console/terminal window title to message."""
420 if not self.params.get('consoletitle', False):
422 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
423 # c_wchar_p() might not be necessary if `message` is
424 # already of type unicode()
425 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
426 elif 'TERM' in os.environ:
427 sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
429 def fixed_template(self):
430 """Checks if the output template is fixed."""
431 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
433 def trouble(self, message=None):
434 """Determine action to take when a download problem appears.
436 Depending on if the downloader has been configured to ignore
437 download errors or not, this method may throw an exception or
438 not when errors are found, after printing the message.
440 if message is not None:
441 self.to_stderr(message)
442 if not self.params.get('ignoreerrors', False):
443 raise DownloadError(message)
444 self._download_retcode = 1
446 def slow_down(self, start_time, byte_counter):
447 """Sleep if the download speed is over the rate limit."""
448 rate_limit = self.params.get('ratelimit', None)
449 if rate_limit is None or byte_counter == 0:
452 elapsed = now - start_time
455 speed = float(byte_counter) / elapsed
456 if speed > rate_limit:
457 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
459 def temp_name(self, filename):
460 """Returns a temporary filename for the given filename."""
461 if self.params.get('nopart', False) or filename == u'-' or \
462 (os.path.exists(filename) and not os.path.isfile(filename)):
464 return filename + u'.part'
466 def undo_temp_name(self, filename):
467 if filename.endswith(u'.part'):
468 return filename[:-len(u'.part')]
471 def try_rename(self, old_filename, new_filename):
473 if old_filename == new_filename:
475 os.rename(old_filename, new_filename)
476 except (IOError, OSError), err:
477 self.trouble(u'ERROR: unable to rename file')
479 def try_utime(self, filename, last_modified_hdr):
480 """Try to set the last-modified time of the given file."""
481 if last_modified_hdr is None:
483 if not os.path.isfile(filename):
485 timestr = last_modified_hdr
488 filetime = timeconvert(timestr)
492 os.utime(filename, (time.time(), filetime))
496 def report_destination(self, filename):
497 """Report destination filename."""
498 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
500 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
501 """Report download progress."""
502 if self.params.get('noprogress', False):
504 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
505 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
506 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
507 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
509 def report_resuming_byte(self, resume_len):
510 """Report attempt to resume at given byte."""
511 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
513 def report_retry(self, count, retries):
514 """Report retry in case of HTTP error 5xx"""
515 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
517 def report_file_already_downloaded(self, file_name):
518 """Report file has already been fully downloaded."""
520 self.to_screen(u'[download] %s has already been downloaded' % file_name)
521 except (UnicodeEncodeError), err:
522 self.to_screen(u'[download] The file has already been downloaded')
524 def report_unable_to_resume(self):
525 """Report it was impossible to resume download."""
526 self.to_screen(u'[download] Unable to resume')
528 def report_finish(self):
529 """Report download finished."""
530 if self.params.get('noprogress', False):
531 self.to_screen(u'[download] Download completed')
535 def increment_downloads(self):
536 """Increment the ordinal that assigns a number to each file."""
537 self._num_downloads += 1
539 def prepare_filename(self, info_dict):
540 """Generate the output filename."""
542 template_dict = dict(info_dict)
543 template_dict['epoch'] = unicode(long(time.time()))
544 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
545 filename = self.params['outtmpl'] % template_dict
547 except (ValueError, KeyError), err:
548 self.trouble(u'ERROR: invalid system charset or erroneous output template')
551 def process_info(self, info_dict):
552 """Process a single dictionary returned by an InfoExtractor."""
553 filename = self.prepare_filename(info_dict)
554 # Do nothing else if in simulate mode
555 if self.params.get('simulate', False):
557 if self.params.get('forcetitle', False):
558 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
559 if self.params.get('forceurl', False):
560 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
561 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
562 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
563 if self.params.get('forcedescription', False) and 'description' in info_dict:
564 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
565 if self.params.get('forcefilename', False) and filename is not None:
566 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
572 if self.params.get('nooverwrites', False) and os.path.exists(filename):
573 self.to_stderr(u'WARNING: file exists and will be skipped')
577 self.pmkdir(filename)
578 except (OSError, IOError), err:
579 self.trouble(u'ERROR: unable to create directories: %s' % str(err))
583 success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
584 except (OSError, IOError), err:
585 raise UnavailableVideoError
586 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
587 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
589 except (ContentTooShortError, ), err:
590 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
595 self.post_process(filename, info_dict)
596 except (PostProcessingError), err:
597 self.trouble(u'ERROR: postprocessing: %s' % str(err))
600 def download(self, url_list):
601 """Download a given list of URLs."""
602 if len(url_list) > 1 and self.fixed_template():
603 raise SameFileError(self.params['outtmpl'])
606 suitable_found = False
608 # Go to next InfoExtractor if not suitable
609 if not ie.suitable(url):
612 # Suitable InfoExtractor found
613 suitable_found = True
615 # Extract information from URL and process it
618 # Suitable InfoExtractor had been found; go to next URL
621 if not suitable_found:
622 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
624 return self._download_retcode
626 def post_process(self, filename, ie_info):
627 """Run the postprocessing chain on the given file."""
629 info['filepath'] = filename
635 def _download_with_rtmpdump(self, filename, url, player_url):
636 self.report_destination(filename)
637 tmpfilename = self.temp_name(filename)
639 # Check for rtmpdump first
641 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
642 except (OSError, IOError):
643 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
646 # Download using rtmpdump. rtmpdump returns exit code 2 when
647 # the connection was interrumpted and resuming appears to be
648 # possible. This is part of rtmpdump's normal usage, AFAIK.
649 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
650 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
651 while retval == 2 or retval == 1:
652 prevsize = os.path.getsize(tmpfilename)
653 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
654 time.sleep(5.0) # This seems to be needed
655 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
656 cursize = os.path.getsize(tmpfilename)
657 if prevsize == cursize and retval == 1:
660 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
661 self.try_rename(tmpfilename, filename)
664 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
667 def _do_download(self, filename, url, player_url):
668 # Check file already present
669 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
670 self.report_file_already_downloaded(filename)
673 # Attempt to download using rtmpdump
674 if url.startswith('rtmp'):
675 return self._download_with_rtmpdump(filename, url, player_url)
677 tmpfilename = self.temp_name(filename)
681 # Do not include the Accept-Encoding header
682 headers = {'Youtubedl-no-compression': 'True'}
683 basic_request = urllib2.Request(url, None, headers)
684 request = urllib2.Request(url, None, headers)
686 # Establish possible resume length
687 if os.path.isfile(tmpfilename):
688 resume_len = os.path.getsize(tmpfilename)
692 # Request parameters in case of being able to resume
693 if self.params.get('continuedl', False) and resume_len != 0:
694 self.report_resuming_byte(resume_len)
695 request.add_header('Range', 'bytes=%d-' % resume_len)
699 retries = self.params.get('retries', 0)
700 while count <= retries:
701 # Establish connection
703 data = urllib2.urlopen(request)
705 except (urllib2.HTTPError, ), err:
706 if (err.code < 500 or err.code >= 600) and err.code != 416:
707 # Unexpected HTTP error
709 elif err.code == 416:
710 # Unable to resume (requested range not satisfiable)
712 # Open the connection again without the range header
713 data = urllib2.urlopen(basic_request)
714 content_length = data.info()['Content-Length']
715 except (urllib2.HTTPError, ), err:
716 if err.code < 500 or err.code >= 600:
719 # Examine the reported length
720 if (content_length is not None and
721 (resume_len - 100 < long(content_length) < resume_len + 100)):
722 # The file had already been fully downloaded.
723 # Explanation to the above condition: in issue #175 it was revealed that
724 # YouTube sometimes adds or removes a few bytes from the end of the file,
725 # changing the file size slightly and causing problems for some users. So
726 # I decided to implement a suggested change and consider the file
727 # completely downloaded if the file size differs less than 100 bytes from
728 # the one in the hard drive.
729 self.report_file_already_downloaded(filename)
730 self.try_rename(tmpfilename, filename)
733 # The length does not match, we start the download over
734 self.report_unable_to_resume()
740 self.report_retry(count, retries)
743 self.trouble(u'ERROR: giving up after %s retries' % retries)
746 data_len = data.info().get('Content-length', None)
747 if data_len is not None:
748 data_len = long(data_len) + resume_len
749 data_len_str = self.format_bytes(data_len)
750 byte_counter = 0 + resume_len
756 data_block = data.read(block_size)
758 if len(data_block) == 0:
760 byte_counter += len(data_block)
762 # Open file just in time
765 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
766 filename = self.undo_temp_name(tmpfilename)
767 self.report_destination(filename)
768 except (OSError, IOError), err:
769 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
772 stream.write(data_block)
773 except (IOError, OSError), err:
774 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
776 block_size = self.best_block_size(after - before, len(data_block))
779 percent_str = self.calc_percent(byte_counter, data_len)
780 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
781 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
782 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
785 self.slow_down(start, byte_counter - resume_len)
789 if data_len is not None and byte_counter != data_len:
790 raise ContentTooShortError(byte_counter, long(data_len))
791 self.try_rename(tmpfilename, filename)
793 # Update file modification time
794 if self.params.get('updatetime', True):
795 self.try_utime(filename, data.info().get('last-modified', None))
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. The information is stored in a dictionary
    which is then passed to the FileDownloader. The FileDownloader
    processes this information possibly downloading the video to the file
    system, among other possible outcomes. The dictionaries must include
    the following fields:

    id:        Video identifier.
    url:       Final video URL.
    uploader:  Nickname of the video uploader.
    title:     Literal title.
    stitle:    Simplified title.
    ext:       Video filename extension.
    format:    Video format.
    player_url: SWF Player URL (may be None).

    The following fields are optional. Their primary purpose is to allow
    youtube-dl to serve as the backend for a video search function, such
    as the one in youtube2mp3. They are only used when their respective
    forced printing functions are called:

    thumbnail:   Full URL to a video thumbnail image.
    description: One-line video description.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods, as well as the suitable() static method.
    Probably, they should also be instantiated and added to the main
    downloader.
    """

    _ready = False       # Whether _real_initialize() has run
    _downloader = None   # FileDownloader in charge of this IE

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @staticmethod
    def suitable(url):
        """Receives a URL and returns True if suitable for this IE."""
        return False

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # Only run the real initialization once per instance.
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass
872 class YoutubeIE(InfoExtractor):
873 """Information extractor for youtube.com."""
875 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
876 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
877 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
878 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
879 _NETRC_MACHINE = 'youtube'
880 # Listed in order of quality
881 _available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
882 _video_extensions = {
888 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
895 return (re.match(YoutubeIE._VALID_URL, url) is not None)
897 def report_lang(self):
898 """Report attempt to set language."""
899 self._downloader.to_screen(u'[youtube] Setting language')
901 def report_login(self):
902 """Report attempt to log in."""
903 self._downloader.to_screen(u'[youtube] Logging in')
905 def report_age_confirmation(self):
906 """Report attempt to confirm age."""
907 self._downloader.to_screen(u'[youtube] Confirming age')
909 def report_video_webpage_download(self, video_id):
910 """Report attempt to download video webpage."""
911 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
913 def report_video_info_webpage_download(self, video_id):
914 """Report attempt to download video info webpage."""
915 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
917 def report_information_extraction(self, video_id):
918 """Report attempt to extract video information."""
919 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
921 def report_unavailable_format(self, video_id, format):
922 """Report extracted video URL."""
923 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
925 def report_rtmp_download(self):
926 """Indicate the download will use the RTMP protocol."""
927 self._downloader.to_screen(u'[youtube] RTMP download detected')
929 def _real_initialize(self):
930 if self._downloader is None:
935 downloader_params = self._downloader.params
937 # Attempt to use provided username and password or .netrc data
938 if downloader_params.get('username', None) is not None:
939 username = downloader_params['username']
940 password = downloader_params['password']
941 elif downloader_params.get('usenetrc', False):
943 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
948 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
949 except (IOError, netrc.NetrcParseError), err:
950 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
954 request = urllib2.Request(self._LANG_URL)
957 urllib2.urlopen(request).read()
958 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
959 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
962 # No authentication to be performed
968 'current_form': 'loginForm',
970 'action_login': 'Log In',
971 'username': username,
972 'password': password,
974 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
977 login_results = urllib2.urlopen(request).read()
978 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
979 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
981 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
982 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
988 'action_confirm': 'Confirm',
990 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
992 self.report_age_confirmation()
993 age_results = urllib2.urlopen(request).read()
994 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
995 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
998 def _real_extract(self, url):
999 # Extract video id from URL
1000 mobj = re.match(self._VALID_URL, url)
1002 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1004 video_id = mobj.group(2)
1007 self.report_video_webpage_download(video_id)
1008 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1010 video_webpage = urllib2.urlopen(request).read()
1011 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1012 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1015 # Attempt to extract SWF player URL
1016 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1017 if mobj is not None:
1018 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1023 self.report_video_info_webpage_download(video_id)
1024 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1025 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1026 % (video_id, el_type))
1027 request = urllib2.Request(video_info_url)
1029 video_info_webpage = urllib2.urlopen(request).read()
1030 video_info = parse_qs(video_info_webpage)
1031 if 'token' in video_info:
1033 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1034 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1036 if 'token' not in video_info:
1037 if 'reason' in video_info:
1038 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1040 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1043 # Start extracting information
1044 self.report_information_extraction(video_id)
1047 if 'author' not in video_info:
1048 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1050 video_uploader = urllib.unquote_plus(video_info['author'][0])
1053 if 'title' not in video_info:
1054 self._downloader.trouble(u'ERROR: unable to extract video title')
1056 video_title = urllib.unquote_plus(video_info['title'][0])
1057 video_title = video_title.decode('utf-8')
1058 video_title = sanitize_title(video_title)
1061 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1062 simple_title = simple_title.strip(ur'_')
1065 if 'thumbnail_url' not in video_info:
1066 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1067 video_thumbnail = ''
1068 else: # don't panic if we can't find it
1069 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1073 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1074 if mobj is not None:
1075 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1076 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1077 for expression in format_expressions:
1079 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1084 video_description = 'No description available.'
1085 if self._downloader.params.get('forcedescription', False):
1086 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1087 if mobj is not None:
1088 video_description = mobj.group(1)
1091 video_token = urllib.unquote_plus(video_info['token'][0])
1093 # Decide which formats to download
1094 req_format = self._downloader.params.get('format', None)
1096 if 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1097 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1098 url_data = [dict(pairStr.split('=') for pairStr in uds.split('&')) for uds in url_data_strs]
1099 url_map = dict((ud['itag'], urllib.unquote(ud['url'])) for ud in url_data)
1100 format_limit = self._downloader.params.get('format_limit', None)
1101 if format_limit is not None and format_limit in self._available_formats:
1102 format_list = self._available_formats[self._available_formats.index(format_limit):]
1104 format_list = self._available_formats
1105 existing_formats = [x for x in format_list if x in url_map]
1106 if len(existing_formats) == 0:
1107 self._downloader.trouble(u'ERROR: no known formats available for video')
1109 if req_format is None:
1110 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1111 elif req_format == '-1':
1112 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1115 if req_format not in url_map:
1116 self._downloader.trouble(u'ERROR: requested format not available')
1118 video_url_list = [(req_format, url_map[req_format])] # Specific format
1120 elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1121 self.report_rtmp_download()
1122 video_url_list = [(None, video_info['conn'][0])]
1125 self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
1128 for format_param, video_real_url in video_url_list:
1129 # At this point we have a new video
1130 self._downloader.increment_downloads()
1133 video_extension = self._video_extensions.get(format_param, 'flv')
1135 # Find the video URL in fmt_url_map or conn paramters
1137 # Process video information
1138 self._downloader.process_info({
1139 'id': video_id.decode('utf-8'),
1140 'url': video_real_url.decode('utf-8'),
1141 'uploader': video_uploader.decode('utf-8'),
1142 'upload_date': upload_date,
1143 'title': video_title,
1144 'stitle': simple_title,
1145 'ext': video_extension.decode('utf-8'),
1146 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1147 'thumbnail': video_thumbnail.decode('utf-8'),
1148 'description': video_description.decode('utf-8'),
1149 'player_url': player_url,
1151 except UnavailableVideoError, err:
1152 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): this listing is gappy — the embedded original line numbers
# jump (e.g. 1189->1191, 1212->1214), so structural lines such as "try:",
# "return" and "if mobj is None:" are not visible here. Comments below
# describe only what the visible lines show.
1155 class MetacafeIE(InfoExtractor):
1156 """Information Extractor for metacafe.com."""
# Watch-page URL; group(1) is the video id, group(2) the simplified title.
1158 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1159 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1160 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
# A YoutubeIE instance is injected so "yt-<id>" Metacafe ids can be
# delegated to the YouTube extractor (see _real_extract below).
1163 def __init__(self, youtube_ie, downloader=None):
1164 InfoExtractor.__init__(self, downloader)
1165 self._youtube_ie = youtube_ie
1169 return (re.match(MetacafeIE._VALID_URL, url) is not None)
1171 def report_disclaimer(self):
1172 """Report disclaimer retrieval."""
1173 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1175 def report_age_confirmation(self):
1176 """Report attempt to confirm age."""
1177 self._downloader.to_screen(u'[metacafe] Confirming age')
1179 def report_download_webpage(self, video_id):
1180 """Report webpage download."""
1181 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1183 def report_extraction(self, video_id):
1184 """Report information extraction."""
1185 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
# Fetch the family-filter disclaimer page, then POST the age-confirmation
# form — presumably to acquire a session cookie so filtered videos are
# reachable (TODO confirm: the cookie/opener setup is not visible here).
1187 def _real_initialize(self):
1188 # Retrieve disclaimer
1189 request = urllib2.Request(self._DISCLAIMER)
1191 self.report_disclaimer()
1192 disclaimer = urllib2.urlopen(request).read()
1193 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1194 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1200 'submit': "Continue - I'm over 18",
1202 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1204 self.report_age_confirmation()
1205 disclaimer = urllib2.urlopen(request).read()
1206 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1207 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1210 def _real_extract(self, url):
1211 # Extract id and simplified title from URL
1212 mobj = re.match(self._VALID_URL, url)
1214 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1217 video_id = mobj.group(1)
1219 # Check if video comes from YouTube
1220 mobj2 = re.match(r'^yt-(.*)$', video_id)
1221 if mobj2 is not None:
# Delegate mirrored YouTube videos to the injected YouTube extractor.
1222 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1225 # At this point we have a new video
1226 self._downloader.increment_downloads()
1228 simple_title = mobj.group(2).decode('utf-8')
1230 # Retrieve video webpage to extract further information
1231 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1233 self.report_download_webpage(video_id)
1234 webpage = urllib2.urlopen(request).read()
1235 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1236 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1239 # Extract URL, uploader and title from webpage
1240 self.report_extraction(video_id)
# Primary path: direct &mediaURL= parameter embedded in the page.
1241 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1242 if mobj is not None:
1243 mediaURL = urllib.unquote(mobj.group(1))
# Extension is guessed from the last three characters of the media URL.
1244 video_extension = mediaURL[-3:]
1246 # Extract gdaKey if available
1247 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1249 video_url = mediaURL
1251 gdaKey = mobj.group(1)
1252 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Fallback path: parse the "flashvars" value as a query string and pull
# mediaURL/key out of its JSON-ish mediaData field.
1254 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1256 self._downloader.trouble(u'ERROR: unable to extract media URL')
1258 vardict = parse_qs(mobj.group(1))
1259 if 'mediaData' not in vardict:
1260 self._downloader.trouble(u'ERROR: unable to extract media URL')
1262 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1264 self._downloader.trouble(u'ERROR: unable to extract media URL')
# Un-escape JSON-style "\/" before using the URL.
1266 mediaURL = mobj.group(1).replace('\\/', '/')
1267 video_extension = mediaURL[-3:]
1268 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1270 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1272 self._downloader.trouble(u'ERROR: unable to extract title')
1274 video_title = mobj.group(1).decode('utf-8')
1275 video_title = sanitize_title(video_title)
1277 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1279 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1281 video_uploader = mobj.group(1)
1284 # Process video information
1285 self._downloader.process_info({
1286 'id': video_id.decode('utf-8'),
1287 'url': video_url.decode('utf-8'),
1288 'uploader': video_uploader.decode('utf-8'),
1289 'upload_date': u'NA',
1290 'title': video_title,
1291 'stitle': simple_title,
1292 'ext': video_extension.decode('utf-8'),
1296 except UnavailableVideoError:
1297 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): gappy excerpt — original line numbers jump (e.g. 1325->1327),
# so "if mobj is None:" / "try:" / "return" lines are not visible here.
1300 class DailymotionIE(InfoExtractor):
1301 """Information Extractor for Dailymotion"""
# group(1) is the video id (before the underscore), group(2) the slug title.
1303 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1305 def __init__(self, downloader=None):
1306 InfoExtractor.__init__(self, downloader)
1310 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1312 def report_download_webpage(self, video_id):
1313 """Report webpage download."""
1314 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1316 def report_extraction(self, video_id):
1317 """Report information extraction."""
1318 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
# No initialization needed for Dailymotion (body not visible in excerpt).
1320 def _real_initialize(self):
1323 def _real_extract(self, url):
1324 # Extract id and simplified title from URL
1325 mobj = re.match(self._VALID_URL, url)
1327 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1330 # At this point we have a new video
1331 self._downloader.increment_downloads()
1332 video_id = mobj.group(1)
1334 simple_title = mobj.group(2).decode('utf-8')
# Dailymotion streams are assumed to be FLV.
1335 video_extension = 'flv'
1337 # Retrieve video webpage to extract further information
1338 request = urllib2.Request(url)
1340 self.report_download_webpage(video_id)
1341 webpage = urllib2.urlopen(request).read()
1342 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1343 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1346 # Extract URL, uploader and title from webpage
1347 self.report_extraction(video_id)
# Media URL comes from the Flash player setup call addVariable("video", ...).
1348 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1350 self._downloader.trouble(u'ERROR: unable to extract media URL')
1352 mediaURL = urllib.unquote(mobj.group(1))
1354 # if needed add http://www.dailymotion.com/ if relative URL
1356 video_url = mediaURL
1358 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
# Title is taken from the <title> tag, stripping the "Dailymotion -" prefix.
1359 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1361 self._downloader.trouble(u'ERROR: unable to extract title')
1363 video_title = mobj.group(1).decode('utf-8')
1364 video_title = sanitize_title(video_title)
# Uploader nickname lives in an <Attribute name="owner"> element.
1366 mobj = re.search(r'(?im)<Attribute name="owner">(.+?)</Attribute>', webpage)
1368 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1370 video_uploader = mobj.group(1)
1373 # Process video information
1374 self._downloader.process_info({
1375 'id': video_id.decode('utf-8'),
1376 'url': video_url.decode('utf-8'),
1377 'uploader': video_uploader.decode('utf-8'),
1378 'upload_date': u'NA',
1379 'title': video_title,
1380 'stitle': simple_title,
1381 'ext': video_extension.decode('utf-8'),
1385 except UnavailableVideoError:
1386 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): gappy excerpt — original line numbers jump (e.g. 1414->1416,
# 1486->1488), so guard/try/return lines and some dict entries (notably the
# 'uploader' entry around original line 1487) are not visible here.
1389 class GoogleIE(InfoExtractor):
1390 """Information extractor for video.google.com."""
# Matches videoplay URLs on the many national Google Video domains;
# group(1) is the (possibly negative) docid.
1392 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1394 def __init__(self, downloader=None):
1395 InfoExtractor.__init__(self, downloader)
1399 return (re.match(GoogleIE._VALID_URL, url) is not None)
1401 def report_download_webpage(self, video_id):
1402 """Report webpage download."""
1403 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1405 def report_extraction(self, video_id):
1406 """Report information extraction."""
1407 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
# No initialization needed (body not visible in excerpt).
1409 def _real_initialize(self):
1412 def _real_extract(self, url):
1413 # Extract id from URL
1414 mobj = re.match(self._VALID_URL, url)
1416 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1419 # At this point we have a new video
1420 self._downloader.increment_downloads()
1421 video_id = mobj.group(1)
# Default to mp4; downgraded to flv below if only the Flash URL is found.
1423 video_extension = 'mp4'
1425 # Retrieve video webpage to extract further information
1426 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1428 self.report_download_webpage(video_id)
1429 webpage = urllib2.urlopen(request).read()
1430 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1431 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1434 # Extract URL, uploader, and title from webpage
1435 self.report_extraction(video_id)
# Preferred: the direct download_url embedded in the page (mp4).
1436 mobj = re.search(r"download_url:'([^']+)'", webpage)
# Fallback: the Flash videoUrl, with \x3d / \x26 escapes decoded below.
1438 video_extension = 'flv'
1439 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1441 self._downloader.trouble(u'ERROR: unable to extract media URL')
1443 mediaURL = urllib.unquote(mobj.group(1))
1444 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1445 mediaURL = mediaURL.replace('\\x26', '\x26')
1447 video_url = mediaURL
1449 mobj = re.search(r'<title>(.*)</title>', webpage)
1451 self._downloader.trouble(u'ERROR: unable to extract title')
1453 video_title = mobj.group(1).decode('utf-8')
1454 video_title = sanitize_title(video_title)
# Collapse everything outside the "simple" character set into underscores.
1455 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1457 # Extract video description
1458 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1460 self._downloader.trouble(u'ERROR: unable to extract video description')
1462 video_description = mobj.group(1).decode('utf-8')
1463 if not video_description:
1464 video_description = 'No description available.'
1466 # Extract video thumbnail
# Thumbnail requires a second request to the search page, so it is only
# fetched when explicitly requested via --get-thumbnail.
1467 if self._downloader.params.get('forcethumbnail', False):
1468 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1470 webpage = urllib2.urlopen(request).read()
1471 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1472 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1474 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1476 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1478 video_thumbnail = mobj.group(1)
1479 else: # we need something to pass to process_info
1480 video_thumbnail = ''
1483 # Process video information
1484 self._downloader.process_info({
1485 'id': video_id.decode('utf-8'),
1486 'url': video_url.decode('utf-8'),
1488 'upload_date': u'NA',
1489 'title': video_title,
1490 'stitle': simple_title,
1491 'ext': video_extension.decode('utf-8'),
1495 except UnavailableVideoError:
1496 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): gappy excerpt — original line numbers jump (e.g. 1524->1526),
# so guard/try/return lines are not visible here.
1499 class PhotobucketIE(InfoExtractor):
1500 """Information extractor for photobucket.com."""
# Only FLV links selected via the ?current= query parameter are supported;
# group(1) is the .flv filename used as the video id.
1502 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1504 def __init__(self, downloader=None):
1505 InfoExtractor.__init__(self, downloader)
1509 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1511 def report_download_webpage(self, video_id):
1512 """Report webpage download."""
1513 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1515 def report_extraction(self, video_id):
1516 """Report information extraction."""
1517 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
# No initialization needed (body not visible in excerpt).
1519 def _real_initialize(self):
1522 def _real_extract(self, url):
1523 # Extract id from URL
1524 mobj = re.match(self._VALID_URL, url)
1526 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1529 # At this point we have a new video
1530 self._downloader.increment_downloads()
1531 video_id = mobj.group(1)
1533 video_extension = 'flv'
1535 # Retrieve video webpage to extract further information
1536 request = urllib2.Request(url)
1538 self.report_download_webpage(video_id)
1539 webpage = urllib2.urlopen(request).read()
1540 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1541 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1544 # Extract URL, uploader, and title from webpage
1545 self.report_extraction(video_id)
# Media URL comes from the video_src <link> element's file= parameter.
1546 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1548 self._downloader.trouble(u'ERROR: unable to extract media URL')
1550 mediaURL = urllib.unquote(mobj.group(1))
1552 video_url = mediaURL
# One regex yields both fields: group(1) is the title, group(2) the
# uploader (used further below).
1554 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1556 self._downloader.trouble(u'ERROR: unable to extract title')
1558 video_title = mobj.group(1).decode('utf-8')
1559 video_title = sanitize_title(video_title)
1560 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
# Uploader from the same <title> match as the video title above.
1562 video_uploader = mobj.group(2).decode('utf-8')
1565 # Process video information
1566 self._downloader.process_info({
1567 'id': video_id.decode('utf-8'),
1568 'url': video_url.decode('utf-8'),
1569 'uploader': video_uploader,
1570 'upload_date': u'NA',
1571 'title': video_title,
1572 'stitle': simple_title,
1573 'ext': video_extension.decode('utf-8'),
1577 except UnavailableVideoError:
1578 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): gappy excerpt — original line numbers jump (e.g. 1609->1611,
# 1723->1725), so guard/try/return lines and the 'url' entry of the
# process_info dict (around original line 1724) are not visible here.
1581 class YahooIE(InfoExtractor):
1582 """Information extractor for video.yahoo.com."""
1584 # _VALID_URL matches all Yahoo! Video URLs
1585 # _VPAGE_URL matches only the extractable '/watch/' URLs
1586 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1587 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1589 def __init__(self, downloader=None):
1590 InfoExtractor.__init__(self, downloader)
1594 return (re.match(YahooIE._VALID_URL, url) is not None)
1596 def report_download_webpage(self, video_id):
1597 """Report webpage download."""
1598 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1600 def report_extraction(self, video_id):
1601 """Report information extraction."""
1602 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
# No initialization needed (body not visible in excerpt).
1604 def _real_initialize(self):
# new_video=False marks the second pass after a non-/watch/ URL has been
# rewritten, so the video is not counted/processed twice.
1607 def _real_extract(self, url, new_video=True):
1608 # Extract ID from URL
1609 mobj = re.match(self._VALID_URL, url)
1611 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1614 # At this point we have a new video
1615 self._downloader.increment_downloads()
1616 video_id = mobj.group(2)
1617 video_extension = 'flv'
1619 # Rewrite valid but non-extractable URLs as
1620 # extractable English language /watch/ URLs
1621 if re.match(self._VPAGE_URL, url) is None:
1622 request = urllib2.Request(url)
1624 webpage = urllib2.urlopen(request).read()
1625 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1626 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
# Scrape the real id/vid pair out of the page's JS and recurse once on
# the canonical /watch/ URL.
1629 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1631 self._downloader.trouble(u'ERROR: Unable to extract id field')
1633 yahoo_id = mobj.group(1)
1635 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1637 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1639 yahoo_vid = mobj.group(1)
1641 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1642 return self._real_extract(url, new_video=False)
1644 # Retrieve video webpage to extract further information
1645 request = urllib2.Request(url)
1647 self.report_download_webpage(video_id)
1648 webpage = urllib2.urlopen(request).read()
1649 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1650 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1653 # Extract uploader and title from webpage
1654 self.report_extraction(video_id)
1655 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1657 self._downloader.trouble(u'ERROR: unable to extract video title')
1659 video_title = mobj.group(1).decode('utf-8')
1660 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1662 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1664 self._downloader.trouble(u'ERROR: unable to extract video uploader')
# NOTE(review): group(1) here is the "people|profile" alternation, not the
# uploader name in group(2) — looks like an off-by-one group index; verify
# against a live page before changing.
1666 video_uploader = mobj.group(1).decode('utf-8')
1668 # Extract video thumbnail
1669 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1671 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1673 video_thumbnail = mobj.group(1).decode('utf-8')
1675 # Extract video description
1676 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1678 self._downloader.trouble(u'ERROR: unable to extract video description')
1680 video_description = mobj.group(1).decode('utf-8')
1681 if not video_description:
1682 video_description = 'No description available.'
1684 # Extract video height and width
1685 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1687 self._downloader.trouble(u'ERROR: unable to extract video height')
1689 yv_video_height = mobj.group(1)
1691 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1693 self._downloader.trouble(u'ERROR: unable to extract video width')
1695 yv_video_width = mobj.group(1)
1697 # Retrieve video playlist to extract media URL
1698 # I'm not completely sure what all these options are, but we
1699 # seem to need most of them, otherwise the server sends a 401.
1700 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1701 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1702 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1703 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1704 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1706 self.report_download_webpage(video_id)
1707 webpage = urllib2.urlopen(request).read()
1708 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1709 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1712 # Extract media URL from playlist XML
1713 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1715 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1717 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
# Unescape any HTML entities the playlist XML left in the URL.
1718 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1721 # Process video information
1722 self._downloader.process_info({
1723 'id': video_id.decode('utf-8'),
1725 'uploader': video_uploader,
1726 'upload_date': u'NA',
1727 'title': video_title,
1728 'stitle': simple_title,
1729 'ext': video_extension.decode('utf-8'),
# NOTE(review): 'thumbnail' and 'description' appear TWICE in this dict
# literal; the later (un-decoded) entries silently win and the first pair
# is dead code — the duplicates should be removed.
1730 'thumbnail': video_thumbnail.decode('utf-8'),
1731 'description': video_description,
1732 'thumbnail': video_thumbnail,
1733 'description': video_description,
1736 except UnavailableVideoError:
1737 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): gappy excerpt — original line numbers jump (e.g. 1782->1784),
# so guard/try/return lines are not visible here.
1740 class GenericIE(InfoExtractor):
1741 """Generic last-resort information extractor."""
1743 def __init__(self, downloader=None):
1744 InfoExtractor.__init__(self, downloader)
1750 def report_download_webpage(self, video_id):
1751 """Report webpage download."""
# Warn the user: falling back here means no site-specific extractor matched.
1752 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1753 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1755 def report_extraction(self, video_id):
1756 """Report information extraction."""
1757 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
# No initialization needed (body not visible in excerpt).
1759 def _real_initialize(self):
1762 def _real_extract(self, url):
1763 # At this point we have a new video
1764 self._downloader.increment_downloads()
# Provisional id: last path component; replaced below once the real media
# URL is known.
1766 video_id = url.split('/')[-1]
1767 request = urllib2.Request(url)
1769 self.report_download_webpage(video_id)
1770 webpage = urllib2.urlopen(request).read()
1771 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1772 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1774 except ValueError, err:
1775 # since this is the last-resort InfoExtractor, if
1776 # this error is thrown, it'll be thrown here
1777 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1780 self.report_extraction(video_id)
1781 # Start with something easy: JW Player in SWFObject
1782 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1784 # Broaden the search a little bit
1785 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1787 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1790 # It's possible that one of the regexes
1791 # matched, but returned an empty group:
1792 if mobj.group(1) is None:
1793 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1796 video_url = urllib.unquote(mobj.group(1))
1797 video_id = os.path.basename(video_url)
1799 # here's a fun little line of code for you:
# Split "name.ext" into extension (without the dot) and bare id.
1800 video_extension = os.path.splitext(video_id)[1][1:]
1801 video_id = os.path.splitext(video_id)[0]
1803 # it's tempting to parse this further, but you would
1804 # have to take into account all the variations like
1805 # Video Title - Site Name
1806 # Site Name | Video Title
1807 # Video Title - Tagline | Site Name
1808 # and so on and so forth; it's just not practical
1809 mobj = re.search(r'<title>(.*)</title>', webpage)
1811 self._downloader.trouble(u'ERROR: unable to extract title')
1813 video_title = mobj.group(1).decode('utf-8')
1814 video_title = sanitize_title(video_title)
1815 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1817 # video uploader is domain name
1818 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1820 self._downloader.trouble(u'ERROR: unable to extract title')
1822 video_uploader = mobj.group(1).decode('utf-8')
1825 # Process video information
1826 self._downloader.process_info({
1827 'id': video_id.decode('utf-8'),
1828 'url': video_url.decode('utf-8'),
1829 'uploader': video_uploader,
1830 'upload_date': u'NA',
1831 'title': video_title,
1832 'stitle': simple_title,
1833 'ext': video_extension.decode('utf-8'),
1837 except UnavailableVideoError, err:
1838 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): gappy excerpt — original line numbers jump (e.g. 1867->1869,
# 1879->1885), so the prefix-parsing branches ("try:", "n = long(prefix)",
# returns) are only partially visible here.
1841 class YoutubeSearchIE(InfoExtractor):
1842 """Information Extractor for YouTube search queries."""
# Query syntax: "ytsearch:<q>" (1 result), "ytsearchN:<q>", "ytsearchall:<q>".
1843 _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1844 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1845 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1846 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1848 _max_youtube_results = 1000
# Actual extraction of each hit is delegated to the injected YoutubeIE.
1850 def __init__(self, youtube_ie, downloader=None):
1851 InfoExtractor.__init__(self, downloader)
1852 self._youtube_ie = youtube_ie
1856 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1858 def report_download_page(self, query, pagenum):
1859 """Report attempt to download playlist page with given number."""
1860 query = query.decode(preferredencoding())
1861 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1863 def _real_initialize(self):
1864 self._youtube_ie.initialize()
1866 def _real_extract(self, query):
1867 mobj = re.match(self._VALID_QUERY, query)
1869 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# Split "ytsearchN" prefix from the query text; N chooses the result count.
1872 prefix, query = query.split(':')
1874 query = query.encode('utf-8')
1876 self._download_n_results(query, 1)
1878 elif prefix == 'all':
1879 self._download_n_results(query, self._max_youtube_results)
1885 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1887 elif n > self._max_youtube_results:
# Requests beyond the cap are clamped, with a warning, not rejected.
1888 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1889 n = self._max_youtube_results
1890 self._download_n_results(query, n)
1892 except ValueError: # parsing prefix as integer fails
1893 self._download_n_results(query, 1)
1896 def _download_n_results(self, query, n):
1897 """Downloads a specified number of results for a query"""
1900 already_seen = set()
1904 self.report_download_page(query, pagenum)
1905 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1906 request = urllib2.Request(result_url)
1908 page = urllib2.urlopen(request).read()
1909 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1910 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1913 # Extract video identifiers
1914 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# The match is href="/watch?v=<id>"; split on '=' and drop the closing
# quote to recover the bare video id.
1915 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1916 if video_id not in already_seen:
1917 video_ids.append(video_id)
1918 already_seen.add(video_id)
1919 if len(video_ids) == n:
1920 # Specified n videos reached
1921 for id in video_ids:
1922 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# No "Next" link means the last results page: flush what we collected.
1925 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1926 for id in video_ids:
1927 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1930 pagenum = pagenum + 1
# NOTE(review): gappy excerpt — original line numbers jump (e.g. 1959->1961,
# 1971->1977), so the prefix-parsing branches are only partially visible.
# Structure parallels YoutubeSearchIE with Google Video endpoints.
1933 class GoogleSearchIE(InfoExtractor):
1934 """Information Extractor for Google Video search queries."""
# Query syntax: "gvsearch:<q>" (1 result), "gvsearchN:<q>", "gvsearchall:<q>".
1935 _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1936 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1937 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1938 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1940 _max_google_results = 1000
# Actual extraction of each hit is delegated to the injected GoogleIE.
1942 def __init__(self, google_ie, downloader=None):
1943 InfoExtractor.__init__(self, downloader)
1944 self._google_ie = google_ie
1948 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1950 def report_download_page(self, query, pagenum):
1951 """Report attempt to download playlist page with given number."""
1952 query = query.decode(preferredencoding())
1953 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1955 def _real_initialize(self):
1956 self._google_ie.initialize()
1958 def _real_extract(self, query):
1959 mobj = re.match(self._VALID_QUERY, query)
1961 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1964 prefix, query = query.split(':')
1966 query = query.encode('utf-8')
1968 self._download_n_results(query, 1)
1970 elif prefix == 'all':
1971 self._download_n_results(query, self._max_google_results)
1977 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1979 elif n > self._max_google_results:
# Requests beyond the cap are clamped, with a warning, not rejected.
1980 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1981 n = self._max_google_results
1982 self._download_n_results(query, n)
1984 except ValueError: # parsing prefix as integer fails
1985 self._download_n_results(query, 1)
1988 def _download_n_results(self, query, n):
1989 """Downloads a specified number of results for a query"""
1992 already_seen = set()
1996 self.report_download_page(query, pagenum)
1997 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1998 request = urllib2.Request(result_url)
2000 page = urllib2.urlopen(request).read()
2001 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2002 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2005 # Extract video identifiers
# Unlike YoutubeSearchIE, the docid is captured directly by group(1).
2006 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2007 video_id = mobj.group(1)
2008 if video_id not in already_seen:
2009 video_ids.append(video_id)
2010 already_seen.add(video_id)
2011 if len(video_ids) == n:
2012 # Specified n videos reached
2013 for id in video_ids:
2014 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
# No "Next" link means the last results page: flush what we collected.
2017 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2018 for id in video_ids:
2019 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2022 pagenum = pagenum + 1
2025 class YahooSearchIE(InfoExtractor):
2026 """Information Extractor for Yahoo! Video search queries."""
2027 _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
2028 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2029 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2030 _MORE_PAGES_INDICATOR = r'\s*Next'
2032 _max_yahoo_results = 1000
2034 def __init__(self, yahoo_ie, downloader=None):
2035 InfoExtractor.__init__(self, downloader)
2036 self._yahoo_ie = yahoo_ie
2040 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
2042 def report_download_page(self, query, pagenum):
2043 """Report attempt to download playlist page with given number."""
2044 query = query.decode(preferredencoding())
2045 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2047 def _real_initialize(self):
2048 self._yahoo_ie.initialize()
2050 def _real_extract(self, query):
2051 mobj = re.match(self._VALID_QUERY, query)
2053 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2056 prefix, query = query.split(':')
2058 query = query.encode('utf-8')
2060 self._download_n_results(query, 1)
2062 elif prefix == 'all':
2063 self._download_n_results(query, self._max_yahoo_results)
2069 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2071 elif n > self._max_yahoo_results:
2072 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2073 n = self._max_yahoo_results
2074 self._download_n_results(query, n)
2076 except ValueError: # parsing prefix as integer fails
2077 self._download_n_results(query, 1)
2080 def _download_n_results(self, query, n):
2081 """Downloads a specified number of results for a query"""
2084 already_seen = set()
2088 self.report_download_page(query, pagenum)
2089 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2090 request = urllib2.Request(result_url)
2092 page = urllib2.urlopen(request).read()
2093 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2094 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2097 # Extract video identifiers
2098 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2099 video_id = mobj.group(1)
2100 if video_id not in already_seen:
2101 video_ids.append(video_id)
2102 already_seen.add(video_id)
2103 if len(video_ids) == n:
2104 # Specified n videos reached
2105 for id in video_ids:
2106 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2109 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2110 for id in video_ids:
2111 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2114 pagenum = pagenum + 1
2117 class YoutubePlaylistIE(InfoExtractor):
2118 """Information Extractor for YouTube playlists."""
2120 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist)\?.*?(p|a)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2121 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2122 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2123 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2126 def __init__(self, youtube_ie, downloader=None):
2127 InfoExtractor.__init__(self, downloader)
2128 self._youtube_ie = youtube_ie
2132 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
2134 def report_download_page(self, playlist_id, pagenum):
2135 """Report attempt to download playlist page with given number."""
2136 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2138 def _real_initialize(self):
2139 self._youtube_ie.initialize()
2141 def _real_extract(self, url):
2142 # Extract playlist id
2143 mobj = re.match(self._VALID_URL, url)
2145 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2149 if mobj.group(3) is not None:
2150 self._youtube_ie.extract(mobj.group(3))
2153 # Download playlist pages
2154 # prefix is 'p' as default for playlists but there are other types that need extra care
2155 playlist_prefix = mobj.group(1)
2156 if playlist_prefix == 'a':
2157 playlist_access = 'artist'
2159 playlist_prefix = 'p'
2160 playlist_access = 'view_play_list'
2161 playlist_id = mobj.group(2)
2166 self.report_download_page(playlist_id, pagenum)
2167 request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2169 page = urllib2.urlopen(request).read()
2170 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2171 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2174 # Extract video identifiers
2176 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2177 if mobj.group(1) not in ids_in_page:
2178 ids_in_page.append(mobj.group(1))
2179 video_ids.extend(ids_in_page)
2181 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2183 pagenum = pagenum + 1
2185 playliststart = self._downloader.params.get('playliststart', 1) - 1
2186 playlistend = self._downloader.params.get('playlistend', -1)
2187 video_ids = video_ids[playliststart:playlistend]
2189 for id in video_ids:
2190 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2194 class YoutubeUserIE(InfoExtractor):
2195 """Information Extractor for YouTube users."""
2197 _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2198 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2199 _GDATA_PAGE_SIZE = 50
2200 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2201 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2204 def __init__(self, youtube_ie, downloader=None):
2205 InfoExtractor.__init__(self, downloader)
2206 self._youtube_ie = youtube_ie
2210 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2212 def report_download_page(self, username, start_index):
2213 """Report attempt to download user page."""
2214 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2215 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2217 def _real_initialize(self):
2218 self._youtube_ie.initialize()
2220 def _real_extract(self, url):
2222 mobj = re.match(self._VALID_URL, url)
2224 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2227 username = mobj.group(1)
2229 # Download video ids using YouTube Data API. Result size per
2230 # query is limited (currently to 50 videos) so we need to query
2231 # page by page until there are no video ids - it means we got
2238 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2239 self.report_download_page(username, start_index)
2241 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2244 page = urllib2.urlopen(request).read()
2245 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2246 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2249 # Extract video identifiers
2252 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2253 if mobj.group(1) not in ids_in_page:
2254 ids_in_page.append(mobj.group(1))
2256 video_ids.extend(ids_in_page)
2258 # A little optimization - if current page is not
2259 # "full", ie. does not contain PAGE_SIZE video ids then
2260 # we can assume that this page is the last one - there
2261 # are no more ids on further pages - no need to query
2264 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2269 all_ids_count = len(video_ids)
2270 playliststart = self._downloader.params.get('playliststart', 1) - 1
2271 playlistend = self._downloader.params.get('playlistend', -1)
2273 if playlistend == -1:
2274 video_ids = video_ids[playliststart:]
2276 video_ids = video_ids[playliststart:playlistend]
2278 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2279 (username, all_ids_count, len(video_ids)))
2281 for video_id in video_ids:
2282 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2285 class DepositFilesIE(InfoExtractor):
2286 """Information extractor for depositfiles.com"""
2288 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2290 def __init__(self, downloader=None):
2291 InfoExtractor.__init__(self, downloader)
2295 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2297 def report_download_webpage(self, file_id):
2298 """Report webpage download."""
2299 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2301 def report_extraction(self, file_id):
2302 """Report information extraction."""
2303 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2305 def _real_initialize(self):
2308 def _real_extract(self, url):
2309 # At this point we have a new file
2310 self._downloader.increment_downloads()
2312 file_id = url.split('/')[-1]
2313 # Rebuild url in english locale
2314 url = 'http://depositfiles.com/en/files/' + file_id
2316 # Retrieve file webpage with 'Free download' button pressed
2317 free_download_indication = { 'gateway_result' : '1' }
2318 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2320 self.report_download_webpage(file_id)
2321 webpage = urllib2.urlopen(request).read()
2322 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2323 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2326 # Search for the real file URL
2327 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2328 if (mobj is None) or (mobj.group(1) is None):
2329 # Try to figure out reason of the error.
2330 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2331 if (mobj is not None) and (mobj.group(1) is not None):
2332 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2333 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2335 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2338 file_url = mobj.group(1)
2339 file_extension = os.path.splitext(file_url)[1][1:]
2341 # Search for file title
2342 mobj = re.search(r'<b title="(.*?)">', webpage)
2344 self._downloader.trouble(u'ERROR: unable to extract title')
2346 file_title = mobj.group(1).decode('utf-8')
2349 # Process file information
2350 self._downloader.process_info({
2351 'id': file_id.decode('utf-8'),
2352 'url': file_url.decode('utf-8'),
2354 'upload_date': u'NA',
2355 'title': file_title,
2356 'stitle': file_title,
2357 'ext': file_extension.decode('utf-8'),
2361 except UnavailableVideoError, err:
2362 self._downloader.trouble(u'ERROR: unable to download file')
2365 class FacebookIE(InfoExtractor):
2366 """Information Extractor for Facebook"""
2368 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2369 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2370 _NETRC_MACHINE = 'facebook'
2371 _available_formats = ['highqual', 'lowqual']
2372 _video_extensions = {
2377 def __init__(self, downloader=None):
2378 InfoExtractor.__init__(self, downloader)
2382 return (re.match(FacebookIE._VALID_URL, url) is not None)
2384 def _reporter(self, message):
2385 """Add header and report message."""
2386 self._downloader.to_screen(u'[facebook] %s' % message)
2388 def report_login(self):
2389 """Report attempt to log in."""
2390 self._reporter(u'Logging in')
2392 def report_video_webpage_download(self, video_id):
2393 """Report attempt to download video webpage."""
2394 self._reporter(u'%s: Downloading video webpage' % video_id)
2396 def report_information_extraction(self, video_id):
2397 """Report attempt to extract video information."""
2398 self._reporter(u'%s: Extracting video information' % video_id)
2400 def _parse_page(self, video_webpage):
2401 """Extract video information from page"""
2403 data = {'title': r'class="video_title datawrap">(.*?)</',
2404 'description': r'<div class="datawrap">(.*?)</div>',
2405 'owner': r'\("video_owner_name", "(.*?)"\)',
2406 'upload_date': r'data-date="(.*?)"',
2407 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2410 for piece in data.keys():
2411 mobj = re.search(data[piece], video_webpage)
2412 if mobj is not None:
2413 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2417 for fmt in self._available_formats:
2418 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2419 if mobj is not None:
2420 # URL is in a Javascript segment inside an escaped Unicode format within
2421 # the generally utf-8 page
2422 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2423 video_info['video_urls'] = video_urls
2427 def _real_initialize(self):
2428 if self._downloader is None:
2433 downloader_params = self._downloader.params
2435 # Attempt to use provided username and password or .netrc data
2436 if downloader_params.get('username', None) is not None:
2437 useremail = downloader_params['username']
2438 password = downloader_params['password']
2439 elif downloader_params.get('usenetrc', False):
2441 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2442 if info is not None:
2446 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2447 except (IOError, netrc.NetrcParseError), err:
2448 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2451 if useremail is None:
2460 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2463 login_results = urllib2.urlopen(request).read()
2464 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2465 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2467 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2468 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2471 def _real_extract(self, url):
2472 mobj = re.match(self._VALID_URL, url)
2474 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2476 video_id = mobj.group('ID')
2479 self.report_video_webpage_download(video_id)
2480 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2482 page = urllib2.urlopen(request)
2483 video_webpage = page.read()
2484 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2485 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2488 # Start extracting information
2489 self.report_information_extraction(video_id)
2491 # Extract information
2492 video_info = self._parse_page(video_webpage)
2495 if 'owner' not in video_info:
2496 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2498 video_uploader = video_info['owner']
2501 if 'title' not in video_info:
2502 self._downloader.trouble(u'ERROR: unable to extract video title')
2504 video_title = video_info['title']
2505 video_title = video_title.decode('utf-8')
2506 video_title = sanitize_title(video_title)
2509 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2510 simple_title = simple_title.strip(ur'_')
2513 if 'thumbnail' not in video_info:
2514 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2515 video_thumbnail = ''
2517 video_thumbnail = video_info['thumbnail']
2521 if 'upload_date' in video_info:
2522 upload_time = video_info['upload_date']
2523 timetuple = email.utils.parsedate_tz(upload_time)
2524 if timetuple is not None:
2526 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2531 video_description = 'No description available.'
2532 if (self._downloader.params.get('forcedescription', False) and
2533 'description' in video_info):
2534 video_description = video_info['description']
2536 url_map = video_info['video_urls']
2537 if len(url_map.keys()) > 0:
2538 # Decide which formats to download
2539 req_format = self._downloader.params.get('format', None)
2540 format_limit = self._downloader.params.get('format_limit', None)
2542 if format_limit is not None and format_limit in self._available_formats:
2543 format_list = self._available_formats[self._available_formats.index(format_limit):]
2545 format_list = self._available_formats
2546 existing_formats = [x for x in format_list if x in url_map]
2547 if len(existing_formats) == 0:
2548 self._downloader.trouble(u'ERROR: no known formats available for video')
2550 if req_format is None:
2551 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2552 elif req_format == '-1':
2553 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2556 if req_format not in url_map:
2557 self._downloader.trouble(u'ERROR: requested format not available')
2559 video_url_list = [(req_format, url_map[req_format])] # Specific format
2561 for format_param, video_real_url in video_url_list:
2563 # At this point we have a new video
2564 self._downloader.increment_downloads()
2567 video_extension = self._video_extensions.get(format_param, 'mp4')
2569 # Find the video URL in fmt_url_map or conn paramters
2571 # Process video information
2572 self._downloader.process_info({
2573 'id': video_id.decode('utf-8'),
2574 'url': video_real_url.decode('utf-8'),
2575 'uploader': video_uploader.decode('utf-8'),
2576 'upload_date': upload_date,
2577 'title': video_title,
2578 'stitle': simple_title,
2579 'ext': video_extension.decode('utf-8'),
2580 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2581 'thumbnail': video_thumbnail.decode('utf-8'),
2582 'description': video_description.decode('utf-8'),
2585 except UnavailableVideoError, err:
2586 self._downloader.trouble(u'\nERROR: unable to download video')
class PostProcessor(object):
    """Post Processor class.

    PostProcessor objects can be added to downloaders with their
    add_post_processor() method. When the downloader has finished a
    successful download, it will take its internal chain of PostProcessors
    and start calling the run() method on each one of them, first with
    an initial argument and then with the returned value of the previous
    PostProcessor.

    The chain will be stopped if one of them ever returns None or the end
    of the chain is reached.

    PostProcessor objects follow a "mutual registration" process similar
    to InfoExtractor objects.
    """

    def __init__(self, downloader=None):
        # Keep a reference to the owning downloader (may be None until
        # set_downloader() is called during registration).
        self._downloader = downloader

    def set_downloader(self, downloader):
        """Sets the downloader for this PP."""
        self._downloader = downloader

    def run(self, information):
        """Run the PostProcessor.

        The "information" argument is a dictionary like the ones
        composed by InfoExtractors, extended with a "filepath" field
        that points to the downloaded file.

        Returning None stops the postprocessing chain; returning an
        information dictionary passes it on to the next processor. A
        PostProcessingError may be raised and is handled by the
        downloader.
        """
        # Base implementation: pass the information through untouched.
        return information
class FFmpegExtractAudioPP(PostProcessor):
    """Post processor that extracts the audio track of a downloaded video
    using ffmpeg/ffprobe, optionally transcoding to a preferred codec.
    """
    # NOTE(review): several return/assignment lines were lost in extraction
    # and have been reconstructed — verify against VCS history.

    def __init__(self, downloader=None, preferredcodec=None):
        PostProcessor.__init__(self, downloader)
        if preferredcodec is None:
            preferredcodec = 'best'
        self._preferredcodec = preferredcodec

    @staticmethod
    def get_audio_codec(path):
        """Return the audio codec name of the file at path, or None."""
        try:
            cmd = ['ffprobe', '-show_streams', '--', path]
            # Fix: use open() rather than the Python-2-only file() builtin.
            handle = subprocess.Popen(cmd, stderr=open(os.path.devnull, 'w'), stdout=subprocess.PIPE)
            output = handle.communicate()[0]
            if handle.wait() != 0:
                return None
        except (IOError, OSError):
            return None
        audio_codec = None
        for line in output.split('\n'):
            if line.startswith('codec_name='):
                audio_codec = line.split('=')[1].strip()
            elif line.strip() == 'codec_type=audio' and audio_codec is not None:
                return audio_codec
        return None

    @staticmethod
    def run_ffmpeg(path, out_path, codec, more_opts):
        """Run ffmpeg to transcode path into out_path; return success flag."""
        try:
            cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
            ret = subprocess.call(cmd, stdout=open(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
            return (ret == 0)
        except (IOError, OSError):
            return False

    def run(self, information):
        """Extract the audio track and replace 'filepath' in information."""
        path = information['filepath']

        filecodec = self.get_audio_codec(path)
        if filecodec is None:
            self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
            return None

        more_opts = []
        if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
            if filecodec == 'aac' or filecodec == 'mp3':
                # Lossless if possible
                acodec = 'copy'
                extension = filecodec
                if filecodec == 'aac':
                    more_opts = ['-f', 'adts']
            else:
                # MP3 otherwise.
                acodec = 'libmp3lame'
                extension = 'mp3'
                more_opts = ['-ab', '128k']
        else:
            # We convert the audio (lossy)
            acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
            extension = self._preferredcodec
            more_opts = ['-ab', '128k']
            if self._preferredcodec == 'aac':
                more_opts += ['-f', 'adts']

        (prefix, ext) = os.path.splitext(path)
        new_path = prefix + '.' + extension
        self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
        status = self.run_ffmpeg(path, new_path, acodec, more_opts)

        if not status:
            self._downloader.to_stderr(u'WARNING: error running ffmpeg')
            return None

        # Remove the original video file only after a successful conversion.
        try:
            os.remove(path)
        except (IOError, OSError):
            self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
            return None

        information['filepath'] = new_path
        return information
2718 ### MAIN PROGRAM ###
2719 if __name__ == '__main__':
2721 # Modules needed only when running the main program
2725 # Function to update the program file with the latest version from the repository.
2726 def update_self(downloader, filename):
2727 # Note: downloader only used for options
2728 if not os.access(filename, os.W_OK):
2729 sys.exit('ERROR: no write permissions on %s' % filename)
2731 downloader.to_screen('Updating to latest stable version...')
2733 latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
2734 latest_version = urllib.urlopen(latest_url).read().strip()
2735 prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
2736 newcontent = urllib.urlopen(prog_url).read()
2737 except (IOError, OSError), err:
2738 sys.exit('ERROR: unable to download latest version')
2740 stream = open(filename, 'w')
2741 stream.write(newcontent)
2743 except (IOError, OSError), err:
2744 sys.exit('ERROR: unable to overwrite current version')
2745 downloader.to_screen('Updated to version %s' % latest_version)
2747 # Parse command line
2748 parser = optparse.OptionParser(
2749 usage='Usage: %prog [options] url...',
2750 version='2011.08.04',
2751 conflict_handler='resolve',
2754 parser.add_option('-h', '--help',
2755 action='help', help='print this help text and exit')
2756 parser.add_option('-v', '--version',
2757 action='version', help='print program version and exit')
2758 parser.add_option('-U', '--update',
2759 action='store_true', dest='update_self', help='update this program to latest stable version')
2760 parser.add_option('-i', '--ignore-errors',
2761 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
2762 parser.add_option('-r', '--rate-limit',
2763 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
2764 parser.add_option('-R', '--retries',
2765 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
2766 parser.add_option('--playlist-start',
2767 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
2768 parser.add_option('--playlist-end',
2769 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
2770 parser.add_option('--dump-user-agent',
2771 action='store_true', dest='dump_user_agent',
2772 help='display the current browser identification', default=False)
2774 authentication = optparse.OptionGroup(parser, 'Authentication Options')
2775 authentication.add_option('-u', '--username',
2776 dest='username', metavar='USERNAME', help='account username')
2777 authentication.add_option('-p', '--password',
2778 dest='password', metavar='PASSWORD', help='account password')
2779 authentication.add_option('-n', '--netrc',
2780 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
2781 parser.add_option_group(authentication)
2783 video_format = optparse.OptionGroup(parser, 'Video Format Options')
2784 video_format.add_option('-f', '--format',
2785 action='store', dest='format', metavar='FORMAT', help='video format code')
2786 video_format.add_option('--all-formats',
2787 action='store_const', dest='format', help='download all available video formats', const='-1')
2788 video_format.add_option('--max-quality',
2789 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
2790 parser.add_option_group(video_format)
2792 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
2793 verbosity.add_option('-q', '--quiet',
2794 action='store_true', dest='quiet', help='activates quiet mode', default=False)
2795 verbosity.add_option('-s', '--simulate',
2796 action='store_true', dest='simulate', help='do not download video', default=False)
2797 verbosity.add_option('-g', '--get-url',
2798 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
2799 verbosity.add_option('-e', '--get-title',
2800 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
2801 verbosity.add_option('--get-thumbnail',
2802 action='store_true', dest='getthumbnail',
2803 help='simulate, quiet but print thumbnail URL', default=False)
2804 verbosity.add_option('--get-description',
2805 action='store_true', dest='getdescription',
2806 help='simulate, quiet but print video description', default=False)
2807 verbosity.add_option('--get-filename',
2808 action='store_true', dest='getfilename',
2809 help='simulate, quiet but print output filename', default=False)
2810 verbosity.add_option('--no-progress',
2811 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
2812 verbosity.add_option('--console-title',
2813 action='store_true', dest='consoletitle',
2814 help='display progress in console titlebar', default=False)
2815 parser.add_option_group(verbosity)
2817 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
2818 filesystem.add_option('-t', '--title',
2819 action='store_true', dest='usetitle', help='use title in file name', default=False)
2820 filesystem.add_option('-l', '--literal',
2821 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
2822 filesystem.add_option('-A', '--auto-number',
2823 action='store_true', dest='autonumber',
2824 help='number downloaded files starting from 00000', default=False)
2825 filesystem.add_option('-o', '--output',
2826 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
2827 filesystem.add_option('-a', '--batch-file',
2828 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
2829 filesystem.add_option('-w', '--no-overwrites',
2830 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
2831 filesystem.add_option('-c', '--continue',
2832 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
2833 filesystem.add_option('--cookies',
2834 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
2835 filesystem.add_option('--no-part',
2836 action='store_true', dest='nopart', help='do not use .part files', default=False)
2837 filesystem.add_option('--no-mtime',
2838 action='store_false', dest='updatetime',
2839 help='do not use the Last-modified header to set the file modification time', default=True)
2840 parser.add_option_group(filesystem)
2842 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
2843 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
2844 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
2845 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
2846 help='"best", "aac" or "mp3"; best by default')
2847 parser.add_option_group(postproc)
2849 (opts, args) = parser.parse_args()
2851 # Open appropriate CookieJar
2852 if opts.cookiefile is None:
2853 jar = cookielib.CookieJar()
2856 jar = cookielib.MozillaCookieJar(opts.cookiefile)
2857 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
2859 except (IOError, OSError), err:
2860 sys.exit(u'ERROR: unable to open cookie file')
2863 if opts.dump_user_agent:
2864 print std_headers['User-Agent']
2867 # General configuration
2868 cookie_processor = urllib2.HTTPCookieProcessor(jar)
2869 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()))
2870 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
2872 # Batch file verification
2874 if opts.batchfile is not None:
2876 if opts.batchfile == '-':
2879 batchfd = open(opts.batchfile, 'r')
2880 batchurls = batchfd.readlines()
2881 batchurls = [x.strip() for x in batchurls]
2882 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
2884 sys.exit(u'ERROR: batch file could not be read')
2885 all_urls = batchurls + args
2887 # Conflicting, missing and erroneous options
2888 if opts.usenetrc and (opts.username is not None or opts.password is not None):
2889 parser.error(u'using .netrc conflicts with giving username/password')
2890 if opts.password is not None and opts.username is None:
# NOTE(review): this excerpt is the interior of the command-line entry point
# (option validation through download dispatch). The enclosing function and
# several interleaved lines — notably the `try:` headers that pair with the
# `except (TypeError, ValueError)` / `except (IOError, OSError)` clauses
# below — fall outside this view, so code lines are left byte-identical and
# only comments are added.

# --- Option sanity checks: each violation aborts via parser.error() ---
2891 parser.error(u'account username missing')
# An explicit -o/--output template is mutually exclusive with the
# title/literal/autonumber filename switches (they would both try to
# dictate the output filename).
2892 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
2893 parser.error(u'using output template conflicts with using title, literal title or auto number')
2894 if opts.usetitle and opts.useliteral:
2895 parser.error(u'using title conflicts with using literal title')
# Username without password: prompt interactively (no echo) rather than fail.
2896 if opts.username is not None and opts.password is None:
2897 opts.password = getpass.getpass(u'Type account password and press return:')
# Convert the human-readable rate limit (e.g. "50k") to a byte count;
# parse_bytes returns None on unparseable input.
2898 if opts.ratelimit is not None:
2899 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
2900 if numeric_limit is None:
2901 parser.error(u'invalid rate limit specified')
2902 opts.ratelimit = numeric_limit
# --retries must be an integer (the paired `try:` is outside this excerpt).
2903 if opts.retries is not None:
2905 opts.retries = long(opts.retries)
2906 except (TypeError, ValueError), err:
2907 parser.error(u'invalid retry count specified')
# --playlist-start must be a positive integer.
2909 opts.playliststart = long(opts.playliststart)
2910 if opts.playliststart <= 0:
2912 except (TypeError, ValueError), err:
2913 parser.error(u'invalid playlist start number specified')
# --playlist-end is either -1 (no limit) or a positive integer not below
# the start index.
2915 opts.playlistend = long(opts.playlistend)
2916 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
2918 except (TypeError, ValueError), err:
2919 parser.error(u'invalid playlist end number specified')
# Audio extraction supports a fixed set of target codecs.
2920 if opts.extractaudio:
2921 if opts.audioformat not in ['best', 'aac', 'mp3']:
2922 parser.error(u'invalid audio format specified')
2924 # Information extractors
# One instance per supported site; the playlist/user/search extractors and
# MetacafeIE delegate actual video downloads to the youtube_ie / google_ie /
# yahoo_ie instance passed to their constructor.
2925 youtube_ie = YoutubeIE()
2926 metacafe_ie = MetacafeIE(youtube_ie)
2927 dailymotion_ie = DailymotionIE()
2928 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
2929 youtube_user_ie = YoutubeUserIE(youtube_ie)
2930 youtube_search_ie = YoutubeSearchIE(youtube_ie)
2931 google_ie = GoogleIE()
2932 google_search_ie = GoogleSearchIE(google_ie)
2933 photobucket_ie = PhotobucketIE()
2934 yahoo_ie = YahooIE()
2935 yahoo_search_ie = YahooSearchIE(yahoo_ie)
2936 deposit_files_ie = DepositFilesIE()
2937 facebook_ie = FacebookIE()
2938 generic_ie = GenericIE()
# --- Build the FileDownloader from the parsed options ---
2941 fd = FileDownloader({
2942 'usenetrc': opts.usenetrc,
2943 'username': opts.username,
2944 'password': opts.password,
# Any of the "print metadata and exit" switches implies quiet + simulate.
2945 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
2946 'forceurl': opts.geturl,
2947 'forcetitle': opts.gettitle,
2948 'forcethumbnail': opts.getthumbnail,
2949 'forcedescription': opts.getdescription,
2950 'forcefilename': opts.getfilename,
2951 'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
2952 'format': opts.format,
2953 'format_limit': opts.format_limit,
# Chained `or` picks the first applicable output template: an explicit
# -o value (decoded from the locale encoding) wins, then format/-1
# variants, then autonumber/title combinations, falling back to
# '%(id)s.%(ext)s'. Relies on non-empty unicode strings being truthy.
2954 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
2955 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
2956 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
2957 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
2958 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
2959 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
2960 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
2961 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
2962 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
2963 or u'%(id)s.%(ext)s'),
2964 'ignoreerrors': opts.ignoreerrors,
2965 'ratelimit': opts.ratelimit,
2966 'nooverwrites': opts.nooverwrites,
2967 'retries': opts.retries,
2968 'continuedl': opts.continue_dl,
2969 'noprogress': opts.noprogress,
2970 'playliststart': opts.playliststart,
2971 'playlistend': opts.playlistend,
# Writing the video to stdout ('-o -') forces log output to stderr.
2972 'logtostderr': opts.outtmpl == '-',
2973 'consoletitle': opts.consoletitle,
2974 'nopart': opts.nopart,
2975 'updatetime': opts.updatetime,
# --- Register extractors; registration order is significant: the more
# specific playlist/user/search extractors must be tried before the plain
# site extractors they wrap. ---
2977 fd.add_info_extractor(youtube_search_ie)
2978 fd.add_info_extractor(youtube_pl_ie)
2979 fd.add_info_extractor(youtube_user_ie)
2980 fd.add_info_extractor(metacafe_ie)
2981 fd.add_info_extractor(dailymotion_ie)
2982 fd.add_info_extractor(youtube_ie)
2983 fd.add_info_extractor(google_ie)
2984 fd.add_info_extractor(google_search_ie)
2985 fd.add_info_extractor(photobucket_ie)
2986 fd.add_info_extractor(yahoo_ie)
2987 fd.add_info_extractor(yahoo_search_ie)
2988 fd.add_info_extractor(deposit_files_ie)
2989 fd.add_info_extractor(facebook_ie)
2991 # This must come last since it's the
2992 # fallback if none of the others work
2993 fd.add_info_extractor(generic_ie)
# Optional ffmpeg-based audio extraction post-processor (-x).
2996 if opts.extractaudio:
2997 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))
# -U: replace the running script with the latest released version.
3000 if opts.update_self:
3001 update_self(fd, sys.argv[0])
# No URLs is an error unless the invocation was purely a self-update.
3004 if len(all_urls) < 1:
3005 if not opts.update_self:
3006 parser.error(u'you must provide at least one URL')
# download() returns the process exit code (propagated by the caller).
3009 retcode = fd.download(all_urls)
3011 # Dump cookie jar if requested
# NOTE(review): the paired `try:` and the jar.save() call sit outside this
# excerpt (original lines 3013-3014).
3012 if opts.cookiefile is not None:
3015 except (IOError, OSError), err:
3016 sys.exit(u'ERROR: unable to save cookie jar')
# --- Tail of the outer try: translate known failures into exit messages ---
3020 except DownloadError:
3022 except SameFileError:
3023 sys.exit(u'ERROR: fixed output name but more than one file to download')
3024 except KeyboardInterrupt:
3025 sys.exit(u'\nERROR: Interrupted by user')