2 # -*- coding: utf-8 -*-
5 "Ricardo Garcia Gonzalez",
14 __license__ = "Public Domain"
39 # parse_qs was moved from the cgi module to the urlparse module recently.
41 from urlparse import parse_qs
43 from cgi import parse_qs
46 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
47 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
48 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
49 'Accept-Encoding': 'gzip, deflate',
50 'Accept-Language': 'en-us,en;q=0.5',
# Characters allowed in a "simplified" title: ASCII letters + digits, as unicode.
# NOTE(review): str.decode() on these constants is Python 2 only.
53 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
55 def preferredencoding():
56 """Get preferred encoding.
58 Returns the best encoding scheme for the system, based on
59 locale.getpreferredencoding() and some further tweaks.
# NOTE(review): several original lines are missing from this excerpt —
# the generator's try/except fallback and its yield are not visible here.
61 def yield_preferredencoding():
63 pref = locale.getpreferredencoding()
# .next() is the Python 2 generator protocol (next(gen) in Python 3).
# Using a generator lets the (presumed) locale probe run only once.
69 return yield_preferredencoding().next()
71 def htmlentity_transform(matchobj):
72 """Transforms an HTML entity to a Unicode character.
74 This function receives a match object and is intended to be used with
75 the re.sub() function.
# group(1) is the entity name/number without the surrounding '&' and ';'.
77 entity = matchobj.group(1)
79 # Known non-numeric HTML entity
80 if entity in htmlentitydefs.name2codepoint:
# unichr()/name2codepoint are the Python 2 spellings (chr()/html.entities in py3).
81 return unichr(htmlentitydefs.name2codepoint[entity])
# Numeric character reference, decimal (#123) or hex (#x7B).
84 mobj = re.match(ur'(?u)#(x?\d+)', entity)
# NOTE(review): the `if mobj is not None:` guard and the `base = 16/10`
# assignments are among the lines missing from this excerpt — `base` below
# is presumably set according to the 'x' prefix; confirm against the full file.
86 numstr = mobj.group(1)
87 if numstr.startswith(u'x'):
# Prefixing '0' turns 'x7B' into '0x7B' so long(numstr, 16) can parse it.
89 numstr = u'0%s' % numstr
92 return unichr(long(numstr, base))
94 # Unknown entity in name, return its literal representation
95 return (u'&%s;' % entity)
97 def sanitize_title(utitle):
98 """Sanitizes a video title so it could be used as part of a filename."""
# First resolve all HTML entities to real characters...
99 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
# ...then replace the path separator so the title cannot escape the directory.
# NOTE(review): ur'' literals and unicode() are Python 2 only syntax/names.
100 return utitle.replace(unicode(os.sep), u'%')
102 def sanitize_open(filename, open_mode):
103 """Try to open the given filename, and slightly tweak it if this fails.
105 Attempts to open the given filename. If this fails, it tries to change
106 the filename slightly, step by step, until it's either able to open it
107 or it fails and raises a final exception, like the standard open()
110 It returns the tuple (stream, definitive_file_name).
# NOTE(review): the enclosing `try:` lines and (presumably) a special case
# routing u'-' to stdout are among the lines missing from this excerpt.
114 if sys.platform == 'win32':
# Switch stdout to binary mode on Windows so video bytes aren't CRLF-mangled.
116 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
117 return (sys.stdout, filename)
118 stream = open(filename, open_mode)
119 return (stream, filename)
# `except E, err` is Python 2 syntax (py3: `except E as err`).
120 except (IOError, OSError), err:
121 # In case of error, try to remove win32 forbidden chars
122 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
124 # An exception here should be caught in the caller
125 stream = open(filename, open_mode)
126 return (stream, filename)
128 def timeconvert(timestr):
129 """Convert RFC 2822 defined time string into system timestamp"""
# parsedate_tz returns a 10-tuple (with tz offset) or None on parse failure.
131 timetuple = email.utils.parsedate_tz(timestr)
132 if timetuple is not None:
133 timestamp = email.utils.mktime_tz(timetuple)
# NOTE(review): the initialisation of `timestamp` (presumably None) and the
# final `return timestamp` are among the lines missing from this excerpt.
# Raised by trouble() when ignoreerrors is off; carries the error message.
136 class DownloadError(Exception):
137 """Download Error exception.
139 This exception may be thrown by FileDownloader objects if they are not
140 configured to continue on errors. They will contain the appropriate
# NOTE(review): the docstring's closing lines are missing from this excerpt.
# Raised by download() when a fixed output template would collide for >1 URL.
145 class SameFileError(Exception):
146 """Same File exception.
148 This exception will be thrown by FileDownloader objects if they detect
149 multiple files would have to be downloaded to the same file on disk.
# NOTE(review): the docstring's closing lines are missing from this excerpt.
# Raised by PostProcessor.run(); caught by FileDownloader.process_info().
153 class PostProcessingError(Exception):
154 """Post Processing exception.
156 This exception may be raised by PostProcessor's .run() method to
157 indicate an error in the postprocessing task.
# NOTE(review): the docstring's closing lines are missing from this excerpt.
# Raised by process_info() when the actual download fails with an OS/IO error.
161 class UnavailableVideoError(Exception):
162 """Unavailable Format exception.
164 This exception will be thrown when a video is requested
165 in a format that is not available for that video.
# NOTE(review): the docstring's closing lines are missing from this excerpt.
169 class ContentTooShortError(Exception):
170 """Content Too Short exception.
172 This exception may be raised by FileDownloader objects when a file they
173 download is too small for what the server announced first, indicating
174 the connection was probably interrupted.
# Byte counts are stored so the handler can report "expected X, served Y"
# (see the except clause in FileDownloader.process_info).
180 def __init__(self, downloaded, expected):
# downloaded: bytes actually received
181 self.downloaded = downloaded
# expected: bytes announced by the server (Content-Length + resume offset)
182 self.expected = expected
184 class YoutubeDLHandler(urllib2.HTTPHandler):
185 """Handler for HTTP requests and responses.
187 This class, when installed with an OpenerDirector, automatically adds
188 the standard headers to every HTTP request and handles gzipped and
189 deflated responses from web servers. If compression is to be avoided in
190 a particular request, the original request in the program code only has
191 to include the HTTP header "Youtubedl-No-Compression", which will be
192 removed before making the real request.
194 Part of this code was copied from:
196 http://techknack.net/python-urllib2-handlers/
198 Andrew Rowls, the author of that code, agreed to release it to the
# NOTE(review): the `deflate` helper's def line and its try/except are missing
# from this excerpt; the two returns below presumably are the raw-deflate
# attempt (negative wbits = no zlib header) and the zlib-wrapped fallback —
# confirm against the full file.
205 return zlib.decompress(data, -zlib.MAX_WBITS)
207 return zlib.decompress(data)
# Compat shim: urllib2.addinfourl grew a `code` constructor arg/getcode()
# only in later Python 2.x; emulate it on older versions.
210 def addinfourl_wrapper(stream, headers, url, code):
211 if hasattr(urllib2.addinfourl, 'getcode'):
212 return urllib2.addinfourl(stream, headers, url, code)
213 ret = urllib2.addinfourl(stream, headers, url)
# NOTE(review): the lines assigning ret.code and returning ret are missing here.
217 def http_request(self, req):
# Add each standard header unless the caller already set it (the guarding
# `if h not in req.headers:` line appears to be missing from this excerpt).
218 for h in std_headers:
221 req.add_header(h, std_headers[h])
# Sentinel header: strip Accept-encoding so the server sends identity encoding.
222 if 'Youtubedl-no-compression' in req.headers:
223 if 'Accept-encoding' in req.headers:
224 del req.headers['Accept-encoding']
225 del req.headers['Youtubedl-no-compression']
# NOTE(review): the `return req` required by the handler protocol is not visible.
228 def http_response(self, req, resp):
# `old_resp = resp` (referenced below) is among the missing lines.
231 if resp.headers.get('Content-encoding', '') == 'gzip':
# Transparently decompress gzip bodies, preserving url/code/msg of the original.
232 gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
233 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
234 resp.msg = old_resp.msg
236 if resp.headers.get('Content-encoding', '') == 'deflate':
237 gz = StringIO.StringIO(self.deflate(resp.read()))
238 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
239 resp.msg = old_resp.msg
# NOTE(review): the `return resp` required by the handler protocol is not visible.
242 class FileDownloader(object):
243 """File Downloader class.
245 File downloader objects are the ones responsible of downloading the
246 actual video file and writing it to disk if the user has requested
247 it, among some other tasks. In most cases there should be one per
248 program. As, given a video URL, the downloader doesn't know how to
249 extract all the needed information, task that InfoExtractors do, it
250 has to pass the URL to one of them.
252 For this, file downloader objects have a method that allows
253 InfoExtractors to be registered in a given order. When it is passed
254 a URL, the file downloader handles it to the first InfoExtractor it
255 finds that reports being able to handle it. The InfoExtractor extracts
256 all the information about the video or videos the URL refers to, and
257 asks the FileDownloader to process the video information, possibly
258 downloading the video.
260 File downloaders accept a lot of parameters. In order not to saturate
261 the object constructor with arguments, it receives a dictionary of
262 options instead. These options are available through the params
263 attribute for the InfoExtractors to use. The FileDownloader also
264 registers itself as the downloader in charge for the InfoExtractors
265 that are added to it, so this is a "mutual registration".
269 username: Username for authentication purposes.
270 password: Password for authentication purposes.
271 usenetrc: Use netrc for authentication instead.
272 quiet: Do not print messages to stdout.
273 forceurl: Force printing final URL.
274 forcetitle: Force printing title.
275 forcethumbnail: Force printing thumbnail URL.
276 forcedescription: Force printing description.
277 forcefilename: Force printing final filename.
278 simulate: Do not download the video files.
279 format: Video format code.
280 format_limit: Highest quality format to try.
281 outtmpl: Template for output names.
282 ignoreerrors: Do not stop on download errors.
283 ratelimit: Download speed limit, in bytes/sec.
284 nooverwrites: Prevent overwriting files.
285 retries: Number of times to retry for HTTP error 5xx
286 continuedl: Try to continue downloads if possible.
287 noprogress: Do not print the progress bar.
288 playliststart: Playlist item to start at.
289 playlistend: Playlist item to end at.
290 logtostderr: Log messages to stderr instead of stdout.
291 consoletitle: Display progress in console window's titlebar.
292 nopart: Do not use temporary .part files.
293 updatetime: Use the Last-modified header to set output file timestamps.
# Class-level defaults; the real values are set per instance in __init__.
299 _download_retcode = None
300 _num_downloads = None
303 def __init__(self, params):
304 """Create a FileDownloader object with the given options."""
# NOTE(review): the lines initialising the extractor/postprocessor lists and
# storing `params` on self are missing from this excerpt.
307 self._download_retcode = 0
308 self._num_downloads = 0
# Index-by-bool trick: False->sys.stdout, True->sys.stderr.
309 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
313 def pmkdir(filename):
314 """Create directory components in filename. Similar to Unix "mkdir -p"."""
315 components = filename.split(os.sep)
# Build every ancestor path prefix of the file's directory.
316 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
317 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
# `dir` shadows the builtin; kept as-is (doc-only pass).
318 for dir in aggregate:
319 if not os.path.exists(dir):
# NOTE(review): the os.mkdir(dir) call is among the missing lines.
323 def format_bytes(bytes):
# Render a byte count as a short human-readable string, e.g. '1.50M'.
# NOTE(review): the decorator (presumably @staticmethod), the str passthrough
# body, and the zero/small-value branch are missing from this excerpt.
326 if type(bytes) is str:
331 exponent = long(math.log(bytes, 1024.0))
332 suffix = 'bkMGTPEZY'[exponent]
333 converted = float(bytes) / float(1024**exponent)
334 return '%.2f%s' % (converted, suffix)
337 def calc_percent(byte_counter, data_len):
# Percentage string right-aligned to 6 chars; the data_len-is-None guard
# (returning '---.-%') is among the missing lines.
340 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
343 def calc_eta(start, now, total, current):
# NOTE(review): `dif = now - start`, the total-is-None guard and the '--:--'
# returns are among the missing lines.
347 if current == 0 or dif < 0.001: # One millisecond
349 rate = float(current) / dif
350 eta = long((float(total) - float(current)) / rate)
351 (eta_mins, eta_secs) = divmod(eta, 60)
354 return '%02d:%02d' % (eta_mins, eta_secs)
357 def calc_speed(start, now, bytes):
# NOTE(review): `dif = now - start` is among the missing lines.
359 if bytes == 0 or dif < 0.001: # One millisecond
360 return '%10s' % '---b/s'
361 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
364 def best_block_size(elapsed_time, bytes):
# Adapt the read size toward the measured throughput, clamped to [new_min, new_max].
365 new_min = max(bytes / 2.0, 1.0)
366 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
367 if elapsed_time < 0.001:
369 rate = bytes / elapsed_time
# NOTE(review): the early return for tiny elapsed_time and the final
# clamped-return logic are among the missing lines.
377 def parse_bytes(bytestr):
378 """Parse a string indicating a byte quantity into a long integer."""
379 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
# NOTE(review): the `matchobj is None` guard is among the missing lines.
382 number = float(matchobj.group(1))
# Empty suffix lowercases to '' -> index of 'b' mismatch avoided because ''
# is not searched; actually ''.index would fail — presumably the missing
# lines normalise the suffix; confirm against the full file.
383 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
384 return long(round(number * multiplier))
386 def add_info_extractor(self, ie):
387 """Add an InfoExtractor object to the end of the list."""
# NOTE(review): the list append is among the missing lines; this is the
# "mutual registration" described in the class docstring.
389 ie.set_downloader(self)
391 def add_post_processor(self, pp):
392 """Add a PostProcessor object to the end of the chain."""
394 pp.set_downloader(self)
396 def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
397 """Print message to stdout if not in quiet mode."""
399 if not self.params.get('quiet', False):
# Index-by-bool: skip_eol picks u'' over u'\n'.
400 terminator = [u'\n', u''][skip_eol]
# Python 2 print-chevron; trailing comma suppresses print's own newline.
401 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
402 self._screen_file.flush()
403 except (UnicodeEncodeError), err:
404 if not ignore_encoding_errors:
# NOTE(review): the re-raise under this condition is among the missing lines.
407 def to_stderr(self, message):
408 """Print message to stderr."""
409 print >>sys.stderr, message.encode(preferredencoding())
411 def to_cons_title(self, message):
412 """Set console/terminal window title to message."""
413 if not self.params.get('consoletitle', False):
# NOTE(review): the early `return` is among the missing lines.
415 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
416 # c_wchar_p() might not be necessary if `message` is
417 # already of type unicode()
418 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
419 elif 'TERM' in os.environ:
# xterm escape sequence OSC 0: set icon name and window title.
420 sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
422 def fixed_template(self):
423 """Checks if the output template is fixed."""
# True when outtmpl contains no %(field)s substitution at all.
424 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
426 def trouble(self, message=None):
427 """Determine action to take when a download problem appears.
429 Depending on if the downloader has been configured to ignore
430 download errors or not, this method may throw an exception or
431 not when errors are found, after printing the message.
433 if message is not None:
434 self.to_stderr(message)
435 if not self.params.get('ignoreerrors', False):
436 raise DownloadError(message)
# ignoreerrors mode: record failure in the process return code and carry on.
437 self._download_retcode = 1
439 def slow_down(self, start_time, byte_counter):
440 """Sleep if the download speed is over the rate limit."""
441 rate_limit = self.params.get('ratelimit', None)
442 if rate_limit is None or byte_counter == 0:
# NOTE(review): the early `return` and `now = time.time()` are among the
# missing lines.
445 elapsed = now - start_time
448 speed = float(byte_counter) / elapsed
449 if speed > rate_limit:
# Sleep exactly long enough that average speed falls back to the limit.
450 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
452 def temp_name(self, filename):
453 """Returns a temporary filename for the given filename."""
# No .part file when disabled, writing to stdout ('-'), or the target
# exists but is not a regular file (e.g. a named pipe).
454 if self.params.get('nopart', False) or filename == u'-' or \
455 (os.path.exists(filename) and not os.path.isfile(filename)):
# NOTE(review): the `return filename` for that branch is among the missing lines.
457 return filename + u'.part'
459 def undo_temp_name(self, filename):
460 if filename.endswith(u'.part'):
461 return filename[:-len(u'.part')]
# NOTE(review): the fall-through `return filename` is among the missing lines.
464 def try_rename(self, old_filename, new_filename):
# Rename .part -> final name; identical names are a no-op.
466 if old_filename == new_filename:
468 os.rename(old_filename, new_filename)
469 except (IOError, OSError), err:
470 self.trouble(u'ERROR: unable to rename file')
472 def try_utime(self, filename, last_modified_hdr):
473 """Try to set the last-modified time of the given file."""
474 if last_modified_hdr is None:
476 if not os.path.isfile(filename):
478 timestr = last_modified_hdr
# NOTE(review): the early returns and the timestr-is-None guard are among
# the missing lines.
481 filetime = timeconvert(timestr)
# atime := now, mtime := server's Last-modified.
485 os.utime(filename,(time.time(), filetime))
489 def report_destination(self, filename):
490 """Report destination filename."""
491 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
493 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
494 """Report download progress."""
495 if self.params.get('noprogress', False):
# \r returns to line start so successive progress updates overwrite in place.
497 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
498 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
499 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
500 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
502 def report_resuming_byte(self, resume_len):
503 """Report attempt to resume at given byte."""
504 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
506 def report_retry(self, count, retries):
507 """Report retry in case of HTTP error 5xx"""
508 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
510 def report_file_already_downloaded(self, file_name):
511 """Report file has already been fully downloaded."""
# First attempt includes the (possibly non-encodable) filename...
513 self.to_screen(u'[download] %s has already been downloaded' % file_name)
514 except (UnicodeEncodeError), err:
# ...fallback message avoids the filename entirely.
515 self.to_screen(u'[download] The file has already been downloaded')
517 def report_unable_to_resume(self):
518 """Report it was impossible to resume download."""
519 self.to_screen(u'[download] Unable to resume')
521 def report_finish(self):
522 """Report download finished."""
523 if self.params.get('noprogress', False):
524 self.to_screen(u'[download] Download completed')
# NOTE(review): the `else:` branch that terminates the progress line is
# among the missing lines.
528 def increment_downloads(self):
529 """Increment the ordinal that assigns a number to each file."""
530 self._num_downloads += 1
532 def prepare_filename(self, info_dict):
533 """Generate the output filename."""
535 template_dict = dict(info_dict)
# Extra template fields: current epoch and a zero-padded per-run counter.
536 template_dict['epoch'] = unicode(long(time.time()))
537 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
538 filename = self.params['outtmpl'] % template_dict
# NOTE(review): the `return filename` / `return None` lines are missing here.
540 except (ValueError, KeyError), err:
541 self.trouble(u'ERROR: invalid system charset or erroneous output template')
544 def process_info(self, info_dict):
545 """Process a single dictionary returned by an InfoExtractor."""
546 filename = self.prepare_filename(info_dict)
547 # Do nothing else if in simulate mode
548 if self.params.get('simulate', False):
# Forced printing goes to real stdout (py2 print statements) so the values
# can be consumed by scripts.
550 if self.params.get('forcetitle', False):
551 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
552 if self.params.get('forceurl', False):
553 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
554 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
555 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
556 if self.params.get('forcedescription', False) and 'description' in info_dict:
557 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
558 if self.params.get('forcefilename', False) and filename is not None:
559 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
565 if self.params.get('nooverwrites', False) and os.path.exists(filename):
566 self.to_stderr(u'WARNING: file exists and will be skipped')
# NOTE(review): the try:/return scaffolding around the following calls is
# among the lines missing from this excerpt.
570 self.pmkdir(filename)
571 except (OSError, IOError), err:
572 self.trouble(u'ERROR: unable to create directories: %s' % str(err))
# URL is encoded to utf-8 bytes before being handed to urllib2.
576 success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
577 except (OSError, IOError), err:
578 raise UnavailableVideoError
579 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
580 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
582 except (ContentTooShortError, ), err:
583 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
588 self.post_process(filename, info_dict)
589 except (PostProcessingError), err:
590 self.trouble(u'ERROR: postprocessing: %s' % str(err))
593 def download(self, url_list):
594 """Download a given list of URLs."""
# A fixed template + multiple URLs would overwrite the same file.
595 if len(url_list) > 1 and self.fixed_template():
596 raise SameFileError(self.params['outtmpl'])
# NOTE(review): the `for url in url_list:` / `for ie in self._ies:` loop
# headers and the extract/continue/break lines are missing from this excerpt.
599 suitable_found = False
601 # Go to next InfoExtractor if not suitable
602 if not ie.suitable(url):
605 # Suitable InfoExtractor found
606 suitable_found = True
608 # Extract information from URL and process it
611 # Suitable InfoExtractor had been found; go to next URL
614 if not suitable_found:
615 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
617 return self._download_retcode
619 def post_process(self, filename, ie_info):
620 """Run the postprocessing chain on the given file."""
# NOTE(review): the copy of ie_info into `info` and the loop over the
# postprocessor chain are among the missing lines.
622 info['filepath'] = filename
628 def _download_with_rtmpdump(self, filename, url, player_url):
629 self.report_destination(filename)
630 tmpfilename = self.temp_name(filename)
632 # Check for rtmpdump first
# `file()` is the Python 2 alias of open(); /dev/null swallows the help text.
634 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
635 except (OSError, IOError):
636 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
639 # Download using rtmpdump. rtmpdump returns exit code 2 when
640 # the connection was interrupted and resuming appears to be
641 # possible. This is part of rtmpdump's normal usage, AFAIK.
# Index-by-bool appends ['-W', player_url] only when a player URL exists.
642 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
643 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
644 while retval == 2 or retval == 1:
645 prevsize = os.path.getsize(tmpfilename)
646 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
647 time.sleep(5.0) # This seems to be needed
648 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
649 cursize = os.path.getsize(tmpfilename)
# No progress since last attempt + "complete" code: treat as finished.
650 if prevsize == cursize and retval == 1:
# NOTE(review): the `break` and the success-path `if retval == 0:` are among
# the missing lines.
653 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
654 self.try_rename(tmpfilename, filename)
657 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
660 def _do_download(self, filename, url, player_url):
661 # Check file already present
662 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
663 self.report_file_already_downloaded(filename)
666 # Attempt to download using rtmpdump
667 if url.startswith('rtmp'):
668 return self._download_with_rtmpdump(filename, url, player_url)
670 tmpfilename = self.temp_name(filename)
674 # Do not include the Accept-Encoding header
# Sentinel consumed by YoutubeDLHandler.http_request above.
675 headers = {'Youtubedl-no-compression': 'True'}
# Two requests: `request` gets a Range header for resuming; `basic_request`
# stays range-free for the 416 fallback below.
676 basic_request = urllib2.Request(url, None, headers)
677 request = urllib2.Request(url, None, headers)
679 # Establish possible resume length
680 if os.path.isfile(tmpfilename):
681 resume_len = os.path.getsize(tmpfilename)
# NOTE(review): the `else: resume_len = 0` and open_mode setup lines are
# among the missing lines.
685 # Request parameters in case of being able to resume
686 if self.params.get('continuedl', False) and resume_len != 0:
687 self.report_resuming_byte(resume_len)
688 request.add_header('Range','bytes=%d-' % resume_len)
# NOTE(review): `count = 0` initialisation is among the missing lines.
692 retries = self.params.get('retries', 0)
693 while count <= retries:
694 # Establish connection
696 data = urllib2.urlopen(request)
698 except (urllib2.HTTPError, ), err:
699 if (err.code < 500 or err.code >= 600) and err.code != 416:
700 # Unexpected HTTP error
702 elif err.code == 416:
703 # Unable to resume (requested range not satisfiable)
705 # Open the connection again without the range header
706 data = urllib2.urlopen(basic_request)
707 content_length = data.info()['Content-Length']
708 except (urllib2.HTTPError, ), err:
709 if err.code < 500 or err.code >= 600:
# NOTE(review): the raise/break statements for these branches are among the
# missing lines.
712 # Examine the reported length
713 if (content_length is not None and
714 (resume_len - 100 < long(content_length) < resume_len + 100)):
715 # The file had already been fully downloaded.
716 # Explanation to the above condition: in issue #175 it was revealed that
717 # YouTube sometimes adds or removes a few bytes from the end of the file,
718 # changing the file size slightly and causing problems for some users. So
719 # I decided to implement a suggested change and consider the file
720 # completely downloaded if the file size differs less than 100 bytes from
721 # the one in the hard drive.
722 self.report_file_already_downloaded(filename)
723 self.try_rename(tmpfilename, filename)
726 # The length does not match, we start the download over
727 self.report_unable_to_resume()
# Retry accounting for 5xx responses.
733 self.report_retry(count, retries)
736 self.trouble(u'ERROR: giving up after %s retries' % retries)
# Header name is case-insensitive in the mimetools message lookup.
739 data_len = data.info().get('Content-length', None)
740 if data_len is not None:
# Total size = announced remainder + the bytes already on disk.
741 data_len = long(data_len) + resume_len
742 data_len_str = self.format_bytes(data_len)
743 byte_counter = 0 + resume_len
# NOTE(review): block_size/start initialisation and the `while True:` read
# loop header are among the missing lines.
749 data_block = data.read(block_size)
751 if len(data_block) == 0:
753 byte_counter += len(data_block)
755 # Open file just in time
# Late open: the stream is created only once the first block has arrived.
758 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
759 filename = self.undo_temp_name(tmpfilename)
760 self.report_destination(filename)
761 except (OSError, IOError), err:
762 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
765 stream.write(data_block)
766 except (IOError, OSError), err:
767 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
# Adaptive chunk size based on how long the last read/write took.
769 block_size = self.best_block_size(after - before, len(data_block))
# Progress math excludes the resumed prefix so speed/ETA reflect this session.
772 percent_str = self.calc_percent(byte_counter, data_len)
773 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
774 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
775 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
778 self.slow_down(start, byte_counter - resume_len)
782 if data_len is not None and byte_counter != data_len:
783 raise ContentTooShortError(byte_counter, long(data_len))
784 self.try_rename(tmpfilename, filename)
786 # Update file modification time
787 if self.params.get('updatetime', True):
788 self.try_utime(filename, data.info().get('last-modified', None))
792 class InfoExtractor(object):
793 """Information Extractor class.
795 Information extractors are the classes that, given a URL, extract
796 information from the video (or videos) the URL refers to. This
797 information includes the real video URL, the video title and simplified
798 title, author and others. The information is stored in a dictionary
799 which is then passed to the FileDownloader. The FileDownloader
800 processes this information possibly downloading the video to the file
801 system, among other possible outcomes. The dictionaries must include
802 the following fields:
804 id: Video identifier.
805 url: Final video URL.
806 uploader: Nickname of the video uploader.
807 title: Literal title.
808 stitle: Simplified title.
809 ext: Video filename extension.
810 format: Video format.
811 player_url: SWF Player URL (may be None).
813 The following fields are optional. Their primary purpose is to allow
814 youtube-dl to serve as the backend for a video search function, such
815 as the one in youtube2mp3. They are only used when their respective
816 forced printing functions are called:
818 thumbnail: Full URL to a video thumbnail image.
819 description: One-line video description.
821 Subclasses of this one should re-define the _real_initialize() and
822 _real_extract() methods, as well as the suitable() static method.
823 Probably, they should also be instantiated and added to the main
830 def __init__(self, downloader=None):
831 """Constructor. Receives an optional downloader."""
# NOTE(review): the `self._ready = False` initialisation appears to be among
# the lines missing from this excerpt.
833 self.set_downloader(downloader)
# NOTE(review): the `def suitable(url):` line (and presumably a @staticmethod
# decorator) is missing from this excerpt; only its docstring is visible.
837 """Receives a URL and returns True if suitable for this IE."""
840 def initialize(self):
841 """Initializes an instance (authentication, etc)."""
# Guarded by a readiness flag (the `if not self._ready:` line is not visible
# here) so _real_initialize runs at most once per instance.
843 self._real_initialize()
846 def extract(self, url):
847 """Extracts URL information and returns it in list of dicts."""
# NOTE(review): the self.initialize() call before extraction is among the
# missing lines.
849 return self._real_extract(url)
851 def set_downloader(self, downloader):
852 """Sets the downloader for this IE."""
853 self._downloader = downloader
# Template methods: subclasses override both; the `pass` bodies are not
# visible in this excerpt.
855 def _real_initialize(self):
856 """Real initialization process. Redefine in subclasses."""
859 def _real_extract(self, url):
860 """Real extraction process. Redefine in subclasses."""
863 class YoutubeIE(InfoExtractor):
864 """Information extractor for youtube.com."""
866 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
867 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
868 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
869 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
870 _NETRC_MACHINE = 'youtube'
871 # Listed in order of quality
872 _available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
873 _video_extensions = {
879 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
886 return (re.match(YoutubeIE._VALID_URL, url) is not None)
888 def report_lang(self):
889 """Report attempt to set language."""
890 self._downloader.to_screen(u'[youtube] Setting language')
892 def report_login(self):
893 """Report attempt to log in."""
894 self._downloader.to_screen(u'[youtube] Logging in')
896 def report_age_confirmation(self):
897 """Report attempt to confirm age."""
898 self._downloader.to_screen(u'[youtube] Confirming age')
900 def report_video_webpage_download(self, video_id):
901 """Report attempt to download video webpage."""
902 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
904 def report_video_info_webpage_download(self, video_id):
905 """Report attempt to download video info webpage."""
906 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
908 def report_information_extraction(self, video_id):
909 """Report attempt to extract video information."""
910 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
912 def report_unavailable_format(self, video_id, format):
913 """Report extracted video URL."""
914 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
916 def report_rtmp_download(self):
917 """Indicate the download will use the RTMP protocol."""
918 self._downloader.to_screen(u'[youtube] RTMP download detected')
920 def _real_initialize(self):
921 if self._downloader is None:
926 downloader_params = self._downloader.params
928 # Attempt to use provided username and password or .netrc data
929 if downloader_params.get('username', None) is not None:
930 username = downloader_params['username']
931 password = downloader_params['password']
932 elif downloader_params.get('usenetrc', False):
934 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
939 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
940 except (IOError, netrc.NetrcParseError), err:
941 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
945 request = urllib2.Request(self._LANG_URL)
948 urllib2.urlopen(request).read()
949 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
950 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
953 # No authentication to be performed
959 'current_form': 'loginForm',
961 'action_login': 'Log In',
962 'username': username,
963 'password': password,
965 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
968 login_results = urllib2.urlopen(request).read()
969 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
970 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
972 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
973 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
979 'action_confirm': 'Confirm',
981 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
983 self.report_age_confirmation()
984 age_results = urllib2.urlopen(request).read()
985 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
986 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
989 def _real_extract(self, url):
990 # Extract video id from URL
991 mobj = re.match(self._VALID_URL, url)
993 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
995 video_id = mobj.group(2)
998 self.report_video_webpage_download(video_id)
999 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1001 video_webpage = urllib2.urlopen(request).read()
1002 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1003 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1006 # Attempt to extract SWF player URL
1007 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1008 if mobj is not None:
1009 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1014 self.report_video_info_webpage_download(video_id)
1015 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1016 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1017 % (video_id, el_type))
1018 request = urllib2.Request(video_info_url)
1020 video_info_webpage = urllib2.urlopen(request).read()
1021 video_info = parse_qs(video_info_webpage)
1022 if 'token' in video_info:
1024 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1025 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1027 if 'token' not in video_info:
1028 if 'reason' in video_info:
1029 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1031 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1034 # Start extracting information
1035 self.report_information_extraction(video_id)
1038 if 'author' not in video_info:
1039 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1041 video_uploader = urllib.unquote_plus(video_info['author'][0])
1044 if 'title' not in video_info:
1045 self._downloader.trouble(u'ERROR: unable to extract video title')
1047 video_title = urllib.unquote_plus(video_info['title'][0])
1048 video_title = video_title.decode('utf-8')
1049 video_title = sanitize_title(video_title)
1052 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1053 simple_title = simple_title.strip(ur'_')
1056 if 'thumbnail_url' not in video_info:
1057 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1058 video_thumbnail = ''
1059 else: # don't panic if we can't find it
1060 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1064 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1065 if mobj is not None:
1066 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1067 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1068 for expression in format_expressions:
1070 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1075 video_description = 'No description available.'
1076 if self._downloader.params.get('forcedescription', False):
1077 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1078 if mobj is not None:
1079 video_description = mobj.group(1)
1082 video_token = urllib.unquote_plus(video_info['token'][0])
1084 # Decide which formats to download
1085 req_format = self._downloader.params.get('format', None)
1087 if 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1088 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1089 url_data = [dict(pairStr.split('=') for pairStr in uds.split('&')) for uds in url_data_strs]
1090 url_map = dict((ud['itag'], urllib.unquote(ud['url'])) for ud in url_data)
1091 format_limit = self._downloader.params.get('format_limit', None)
1092 if format_limit is not None and format_limit in self._available_formats:
1093 format_list = self._available_formats[self._available_formats.index(format_limit):]
1095 format_list = self._available_formats
1096 existing_formats = [x for x in format_list if x in url_map]
1097 if len(existing_formats) == 0:
1098 self._downloader.trouble(u'ERROR: no known formats available for video')
1100 if req_format is None:
1101 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1102 elif req_format == '-1':
1103 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1106 if req_format not in url_map:
1107 self._downloader.trouble(u'ERROR: requested format not available')
1109 video_url_list = [(req_format, url_map[req_format])] # Specific format
1111 elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1112 self.report_rtmp_download()
1113 video_url_list = [(None, video_info['conn'][0])]
1116 self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
1119 for format_param, video_real_url in video_url_list:
1120 # At this point we have a new video
1121 self._downloader.increment_downloads()
1124 video_extension = self._video_extensions.get(format_param, 'flv')
1126 # Find the video URL in fmt_url_map or conn paramters
1128 # Process video information
1129 self._downloader.process_info({
1130 'id': video_id.decode('utf-8'),
1131 'url': video_real_url.decode('utf-8'),
1132 'uploader': video_uploader.decode('utf-8'),
1133 'upload_date': upload_date,
1134 'title': video_title,
1135 'stitle': simple_title,
1136 'ext': video_extension.decode('utf-8'),
1137 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1138 'thumbnail': video_thumbnail.decode('utf-8'),
1139 'description': video_description.decode('utf-8'),
1140 'player_url': player_url,
1142 except UnavailableVideoError, err:
1143 self._downloader.trouble(u'\nERROR: unable to download video')
1146 class MetacafeIE(InfoExtractor):
1147 """Information Extractor for metacafe.com."""
1149 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1150 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1151 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1154 def __init__(self, youtube_ie, downloader=None):
1155 InfoExtractor.__init__(self, downloader)
1156 self._youtube_ie = youtube_ie
1160 return (re.match(MetacafeIE._VALID_URL, url) is not None)
1162 def report_disclaimer(self):
1163 """Report disclaimer retrieval."""
1164 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1166 def report_age_confirmation(self):
1167 """Report attempt to confirm age."""
1168 self._downloader.to_screen(u'[metacafe] Confirming age')
1170 def report_download_webpage(self, video_id):
1171 """Report webpage download."""
1172 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1174 def report_extraction(self, video_id):
1175 """Report information extraction."""
1176 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1178 def _real_initialize(self):
1179 # Retrieve disclaimer
1180 request = urllib2.Request(self._DISCLAIMER)
1182 self.report_disclaimer()
1183 disclaimer = urllib2.urlopen(request).read()
1184 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1185 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1191 'submit': "Continue - I'm over 18",
1193 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1195 self.report_age_confirmation()
1196 disclaimer = urllib2.urlopen(request).read()
1197 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1198 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1201 def _real_extract(self, url):
1202 # Extract id and simplified title from URL
1203 mobj = re.match(self._VALID_URL, url)
1205 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1208 video_id = mobj.group(1)
1210 # Check if video comes from YouTube
1211 mobj2 = re.match(r'^yt-(.*)$', video_id)
1212 if mobj2 is not None:
1213 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1216 # At this point we have a new video
1217 self._downloader.increment_downloads()
1219 simple_title = mobj.group(2).decode('utf-8')
1221 # Retrieve video webpage to extract further information
1222 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1224 self.report_download_webpage(video_id)
1225 webpage = urllib2.urlopen(request).read()
1226 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1227 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1230 # Extract URL, uploader and title from webpage
1231 self.report_extraction(video_id)
1232 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1233 if mobj is not None:
1234 mediaURL = urllib.unquote(mobj.group(1))
1235 video_extension = mediaURL[-3:]
1237 # Extract gdaKey if available
1238 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1240 video_url = mediaURL
1242 gdaKey = mobj.group(1)
1243 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1245 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1247 self._downloader.trouble(u'ERROR: unable to extract media URL')
1249 vardict = parse_qs(mobj.group(1))
1250 if 'mediaData' not in vardict:
1251 self._downloader.trouble(u'ERROR: unable to extract media URL')
1253 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1255 self._downloader.trouble(u'ERROR: unable to extract media URL')
1257 mediaURL = mobj.group(1).replace('\\/', '/')
1258 video_extension = mediaURL[-3:]
1259 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1261 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1263 self._downloader.trouble(u'ERROR: unable to extract title')
1265 video_title = mobj.group(1).decode('utf-8')
1266 video_title = sanitize_title(video_title)
1268 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1270 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1272 video_uploader = mobj.group(1)
1275 # Process video information
1276 self._downloader.process_info({
1277 'id': video_id.decode('utf-8'),
1278 'url': video_url.decode('utf-8'),
1279 'uploader': video_uploader.decode('utf-8'),
1280 'upload_date': u'NA',
1281 'title': video_title,
1282 'stitle': simple_title,
1283 'ext': video_extension.decode('utf-8'),
1287 except UnavailableVideoError:
1288 self._downloader.trouble(u'\nERROR: unable to download video')
1291 class DailymotionIE(InfoExtractor):
1292 """Information Extractor for Dailymotion"""
1294 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1296 def __init__(self, downloader=None):
1297 InfoExtractor.__init__(self, downloader)
1301 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1303 def report_download_webpage(self, video_id):
1304 """Report webpage download."""
1305 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1307 def report_extraction(self, video_id):
1308 """Report information extraction."""
1309 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1311 def _real_initialize(self):
1314 def _real_extract(self, url):
1315 # Extract id and simplified title from URL
1316 mobj = re.match(self._VALID_URL, url)
1318 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1321 # At this point we have a new video
1322 self._downloader.increment_downloads()
1323 video_id = mobj.group(1)
1325 simple_title = mobj.group(2).decode('utf-8')
1326 video_extension = 'flv'
1328 # Retrieve video webpage to extract further information
1329 request = urllib2.Request(url)
1331 self.report_download_webpage(video_id)
1332 webpage = urllib2.urlopen(request).read()
1333 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1334 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1337 # Extract URL, uploader and title from webpage
1338 self.report_extraction(video_id)
1339 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1341 self._downloader.trouble(u'ERROR: unable to extract media URL')
1343 mediaURL = urllib.unquote(mobj.group(1))
1345 # if needed add http://www.dailymotion.com/ if relative URL
1347 video_url = mediaURL
1349 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1350 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1352 self._downloader.trouble(u'ERROR: unable to extract title')
1354 video_title = mobj.group(1).decode('utf-8')
1355 video_title = sanitize_title(video_title)
1357 mobj = re.search(r'(?im)<Attribute name="owner">(.+?)</Attribute>', webpage)
1359 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1361 video_uploader = mobj.group(1)
1364 # Process video information
1365 self._downloader.process_info({
1366 'id': video_id.decode('utf-8'),
1367 'url': video_url.decode('utf-8'),
1368 'uploader': video_uploader.decode('utf-8'),
1369 'upload_date': u'NA',
1370 'title': video_title,
1371 'stitle': simple_title,
1372 'ext': video_extension.decode('utf-8'),
1376 except UnavailableVideoError:
1377 self._downloader.trouble(u'\nERROR: unable to download video')
1379 class GoogleIE(InfoExtractor):
1380 """Information extractor for video.google.com."""
1382 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1384 def __init__(self, downloader=None):
1385 InfoExtractor.__init__(self, downloader)
1389 return (re.match(GoogleIE._VALID_URL, url) is not None)
1391 def report_download_webpage(self, video_id):
1392 """Report webpage download."""
1393 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1395 def report_extraction(self, video_id):
1396 """Report information extraction."""
1397 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1399 def _real_initialize(self):
1402 def _real_extract(self, url):
1403 # Extract id from URL
1404 mobj = re.match(self._VALID_URL, url)
1406 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1409 # At this point we have a new video
1410 self._downloader.increment_downloads()
1411 video_id = mobj.group(1)
1413 video_extension = 'mp4'
1415 # Retrieve video webpage to extract further information
1416 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1418 self.report_download_webpage(video_id)
1419 webpage = urllib2.urlopen(request).read()
1420 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1421 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1424 # Extract URL, uploader, and title from webpage
1425 self.report_extraction(video_id)
1426 mobj = re.search(r"download_url:'([^']+)'", webpage)
1428 video_extension = 'flv'
1429 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1431 self._downloader.trouble(u'ERROR: unable to extract media URL')
1433 mediaURL = urllib.unquote(mobj.group(1))
1434 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1435 mediaURL = mediaURL.replace('\\x26', '\x26')
1437 video_url = mediaURL
1439 mobj = re.search(r'<title>(.*)</title>', webpage)
1441 self._downloader.trouble(u'ERROR: unable to extract title')
1443 video_title = mobj.group(1).decode('utf-8')
1444 video_title = sanitize_title(video_title)
1445 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1447 # Extract video description
1448 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1450 self._downloader.trouble(u'ERROR: unable to extract video description')
1452 video_description = mobj.group(1).decode('utf-8')
1453 if not video_description:
1454 video_description = 'No description available.'
1456 # Extract video thumbnail
1457 if self._downloader.params.get('forcethumbnail', False):
1458 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1460 webpage = urllib2.urlopen(request).read()
1461 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1462 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1464 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1466 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1468 video_thumbnail = mobj.group(1)
1469 else: # we need something to pass to process_info
1470 video_thumbnail = ''
1474 # Process video information
1475 self._downloader.process_info({
1476 'id': video_id.decode('utf-8'),
1477 'url': video_url.decode('utf-8'),
1479 'upload_date': u'NA',
1480 'title': video_title,
1481 'stitle': simple_title,
1482 'ext': video_extension.decode('utf-8'),
1486 except UnavailableVideoError:
1487 self._downloader.trouble(u'\nERROR: unable to download video')
1490 class PhotobucketIE(InfoExtractor):
1491 """Information extractor for photobucket.com."""
1493 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1495 def __init__(self, downloader=None):
1496 InfoExtractor.__init__(self, downloader)
1500 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1502 def report_download_webpage(self, video_id):
1503 """Report webpage download."""
1504 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1506 def report_extraction(self, video_id):
1507 """Report information extraction."""
1508 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1510 def _real_initialize(self):
1513 def _real_extract(self, url):
1514 # Extract id from URL
1515 mobj = re.match(self._VALID_URL, url)
1517 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1520 # At this point we have a new video
1521 self._downloader.increment_downloads()
1522 video_id = mobj.group(1)
1524 video_extension = 'flv'
1526 # Retrieve video webpage to extract further information
1527 request = urllib2.Request(url)
1529 self.report_download_webpage(video_id)
1530 webpage = urllib2.urlopen(request).read()
1531 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1532 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1535 # Extract URL, uploader, and title from webpage
1536 self.report_extraction(video_id)
1537 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1539 self._downloader.trouble(u'ERROR: unable to extract media URL')
1541 mediaURL = urllib.unquote(mobj.group(1))
1543 video_url = mediaURL
1545 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1547 self._downloader.trouble(u'ERROR: unable to extract title')
1549 video_title = mobj.group(1).decode('utf-8')
1550 video_title = sanitize_title(video_title)
1551 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1553 video_uploader = mobj.group(2).decode('utf-8')
1556 # Process video information
1557 self._downloader.process_info({
1558 'id': video_id.decode('utf-8'),
1559 'url': video_url.decode('utf-8'),
1560 'uploader': video_uploader,
1561 'upload_date': u'NA',
1562 'title': video_title,
1563 'stitle': simple_title,
1564 'ext': video_extension.decode('utf-8'),
1568 except UnavailableVideoError:
1569 self._downloader.trouble(u'\nERROR: unable to download video')
1572 class YahooIE(InfoExtractor):
1573 """Information extractor for video.yahoo.com."""
1575 # _VALID_URL matches all Yahoo! Video URLs
1576 # _VPAGE_URL matches only the extractable '/watch/' URLs
1577 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1578 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1580 def __init__(self, downloader=None):
1581 InfoExtractor.__init__(self, downloader)
1585 return (re.match(YahooIE._VALID_URL, url) is not None)
1587 def report_download_webpage(self, video_id):
1588 """Report webpage download."""
1589 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1591 def report_extraction(self, video_id):
1592 """Report information extraction."""
1593 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1595 def _real_initialize(self):
1598 def _real_extract(self, url, new_video=True):
1599 # Extract ID from URL
1600 mobj = re.match(self._VALID_URL, url)
1602 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1605 # At this point we have a new video
1606 self._downloader.increment_downloads()
1607 video_id = mobj.group(2)
1608 video_extension = 'flv'
1610 # Rewrite valid but non-extractable URLs as
1611 # extractable English language /watch/ URLs
1612 if re.match(self._VPAGE_URL, url) is None:
1613 request = urllib2.Request(url)
1615 webpage = urllib2.urlopen(request).read()
1616 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1617 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1620 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1622 self._downloader.trouble(u'ERROR: Unable to extract id field')
1624 yahoo_id = mobj.group(1)
1626 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1628 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1630 yahoo_vid = mobj.group(1)
1632 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1633 return self._real_extract(url, new_video=False)
1635 # Retrieve video webpage to extract further information
1636 request = urllib2.Request(url)
1638 self.report_download_webpage(video_id)
1639 webpage = urllib2.urlopen(request).read()
1640 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1641 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1644 # Extract uploader and title from webpage
1645 self.report_extraction(video_id)
1646 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1648 self._downloader.trouble(u'ERROR: unable to extract video title')
1650 video_title = mobj.group(1).decode('utf-8')
1651 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1653 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1655 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1657 video_uploader = mobj.group(1).decode('utf-8')
1659 # Extract video thumbnail
1660 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1662 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1664 video_thumbnail = mobj.group(1).decode('utf-8')
1666 # Extract video description
1667 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1669 self._downloader.trouble(u'ERROR: unable to extract video description')
1671 video_description = mobj.group(1).decode('utf-8')
1672 if not video_description: video_description = 'No description available.'
1674 # Extract video height and width
1675 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1677 self._downloader.trouble(u'ERROR: unable to extract video height')
1679 yv_video_height = mobj.group(1)
1681 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1683 self._downloader.trouble(u'ERROR: unable to extract video width')
1685 yv_video_width = mobj.group(1)
1687 # Retrieve video playlist to extract media URL
1688 # I'm not completely sure what all these options are, but we
1689 # seem to need most of them, otherwise the server sends a 401.
1690 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1691 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1692 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1693 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1694 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1696 self.report_download_webpage(video_id)
1697 webpage = urllib2.urlopen(request).read()
1698 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1699 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1702 # Extract media URL from playlist XML
1703 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1705 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1707 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1708 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1711 # Process video information
1712 self._downloader.process_info({
1713 'id': video_id.decode('utf-8'),
1715 'uploader': video_uploader,
1716 'upload_date': u'NA',
1717 'title': video_title,
1718 'stitle': simple_title,
1719 'ext': video_extension.decode('utf-8'),
1720 'thumbnail': video_thumbnail.decode('utf-8'),
1721 'description': video_description,
1722 'thumbnail': video_thumbnail,
1723 'description': video_description,
1726 except UnavailableVideoError:
1727 self._downloader.trouble(u'\nERROR: unable to download video')
1730 class GenericIE(InfoExtractor):
1731 """Generic last-resort information extractor."""
1733 def __init__(self, downloader=None):
1734 InfoExtractor.__init__(self, downloader)
1740 def report_download_webpage(self, video_id):
1741 """Report webpage download."""
1742 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1743 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1745 def report_extraction(self, video_id):
1746 """Report information extraction."""
1747 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1749 def _real_initialize(self):
1752 def _real_extract(self, url):
1753 # At this point we have a new video
1754 self._downloader.increment_downloads()
1756 video_id = url.split('/')[-1]
1757 request = urllib2.Request(url)
1759 self.report_download_webpage(video_id)
1760 webpage = urllib2.urlopen(request).read()
1761 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1762 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1764 except ValueError, err:
1765 # since this is the last-resort InfoExtractor, if
1766 # this error is thrown, it'll be thrown here
1767 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1770 self.report_extraction(video_id)
1771 # Start with something easy: JW Player in SWFObject
1772 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1774 # Broaden the search a little bit
1775 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1777 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1780 # It's possible that one of the regexes
1781 # matched, but returned an empty group:
1782 if mobj.group(1) is None:
1783 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1786 video_url = urllib.unquote(mobj.group(1))
1787 video_id = os.path.basename(video_url)
1789 # here's a fun little line of code for you:
1790 video_extension = os.path.splitext(video_id)[1][1:]
1791 video_id = os.path.splitext(video_id)[0]
1793 # it's tempting to parse this further, but you would
1794 # have to take into account all the variations like
1795 # Video Title - Site Name
1796 # Site Name | Video Title
1797 # Video Title - Tagline | Site Name
1798 # and so on and so forth; it's just not practical
1799 mobj = re.search(r'<title>(.*)</title>', webpage)
1801 self._downloader.trouble(u'ERROR: unable to extract title')
1803 video_title = mobj.group(1).decode('utf-8')
1804 video_title = sanitize_title(video_title)
1805 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1807 # video uploader is domain name
1808 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1810 self._downloader.trouble(u'ERROR: unable to extract title')
1812 video_uploader = mobj.group(1).decode('utf-8')
1815 # Process video information
1816 self._downloader.process_info({
1817 'id': video_id.decode('utf-8'),
1818 'url': video_url.decode('utf-8'),
1819 'uploader': video_uploader,
1820 'upload_date': u'NA',
1821 'title': video_title,
1822 'stitle': simple_title,
1823 'ext': video_extension.decode('utf-8'),
1827 except UnavailableVideoError, err:
1828 self._downloader.trouble(u'\nERROR: unable to download video')
1831 class YoutubeSearchIE(InfoExtractor):
1832 """Information Extractor for YouTube search queries."""
1833 _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1834 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1835 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1836 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1838 _max_youtube_results = 1000
1840 def __init__(self, youtube_ie, downloader=None):
1841 InfoExtractor.__init__(self, downloader)
1842 self._youtube_ie = youtube_ie
1846 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1848 def report_download_page(self, query, pagenum):
1849 """Report attempt to download playlist page with given number."""
1850 query = query.decode(preferredencoding())
1851 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1853 def _real_initialize(self):
1854 self._youtube_ie.initialize()
1856 def _real_extract(self, query):
1857 mobj = re.match(self._VALID_QUERY, query)
1859 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1862 prefix, query = query.split(':')
1864 query = query.encode('utf-8')
1866 self._download_n_results(query, 1)
1868 elif prefix == 'all':
1869 self._download_n_results(query, self._max_youtube_results)
1875 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1877 elif n > self._max_youtube_results:
1878 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1879 n = self._max_youtube_results
1880 self._download_n_results(query, n)
1882 except ValueError: # parsing prefix as integer fails
1883 self._download_n_results(query, 1)
1886 def _download_n_results(self, query, n):
1887 """Downloads a specified number of results for a query"""
1890 already_seen = set()
1894 self.report_download_page(query, pagenum)
1895 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1896 request = urllib2.Request(result_url)
1898 page = urllib2.urlopen(request).read()
1899 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1900 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1903 # Extract video identifiers
1904 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1905 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1906 if video_id not in already_seen:
1907 video_ids.append(video_id)
1908 already_seen.add(video_id)
1909 if len(video_ids) == n:
1910 # Specified n videos reached
1911 for id in video_ids:
1912 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1915 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1916 for id in video_ids:
1917 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1920 pagenum = pagenum + 1
1922 class GoogleSearchIE(InfoExtractor):
1923 """Information Extractor for Google Video search queries."""
1924 _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1925 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1926 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1927 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1929 _max_google_results = 1000
1931 def __init__(self, google_ie, downloader=None):
1932 InfoExtractor.__init__(self, downloader)
1933 self._google_ie = google_ie
1937 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1939 def report_download_page(self, query, pagenum):
1940 """Report attempt to download playlist page with given number."""
1941 query = query.decode(preferredencoding())
1942 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1944 def _real_initialize(self):
1945 self._google_ie.initialize()
1947 def _real_extract(self, query):
1948 mobj = re.match(self._VALID_QUERY, query)
1950 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1953 prefix, query = query.split(':')
1955 query = query.encode('utf-8')
1957 self._download_n_results(query, 1)
1959 elif prefix == 'all':
1960 self._download_n_results(query, self._max_google_results)
1966 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1968 elif n > self._max_google_results:
1969 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1970 n = self._max_google_results
1971 self._download_n_results(query, n)
1973 except ValueError: # parsing prefix as integer fails
1974 self._download_n_results(query, 1)
1977 def _download_n_results(self, query, n):
1978 """Downloads a specified number of results for a query"""
1981 already_seen = set()
1985 self.report_download_page(query, pagenum)
1986 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1987 request = urllib2.Request(result_url)
1989 page = urllib2.urlopen(request).read()
1990 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1991 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1994 # Extract video identifiers
1995 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1996 video_id = mobj.group(1)
1997 if video_id not in already_seen:
1998 video_ids.append(video_id)
1999 already_seen.add(video_id)
2000 if len(video_ids) == n:
2001 # Specified n videos reached
2002 for id in video_ids:
2003 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2006 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2007 for id in video_ids:
2008 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2011 pagenum = pagenum + 1
2013 class YahooSearchIE(InfoExtractor):
2014 """Information Extractor for Yahoo! Video search queries."""
2015 _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
2016 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2017 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2018 _MORE_PAGES_INDICATOR = r'\s*Next'
2020 _max_yahoo_results = 1000
2022 def __init__(self, yahoo_ie, downloader=None):
2023 InfoExtractor.__init__(self, downloader)
2024 self._yahoo_ie = yahoo_ie
2028 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
2030 def report_download_page(self, query, pagenum):
2031 """Report attempt to download playlist page with given number."""
2032 query = query.decode(preferredencoding())
2033 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2035 def _real_initialize(self):
2036 self._yahoo_ie.initialize()
2038 def _real_extract(self, query):
2039 mobj = re.match(self._VALID_QUERY, query)
2041 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2044 prefix, query = query.split(':')
2046 query = query.encode('utf-8')
2048 self._download_n_results(query, 1)
2050 elif prefix == 'all':
2051 self._download_n_results(query, self._max_yahoo_results)
2057 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2059 elif n > self._max_yahoo_results:
2060 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2061 n = self._max_yahoo_results
2062 self._download_n_results(query, n)
2064 except ValueError: # parsing prefix as integer fails
2065 self._download_n_results(query, 1)
2068 def _download_n_results(self, query, n):
2069 """Downloads a specified number of results for a query"""
2072 already_seen = set()
2076 self.report_download_page(query, pagenum)
2077 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2078 request = urllib2.Request(result_url)
2080 page = urllib2.urlopen(request).read()
2081 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2082 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2085 # Extract video identifiers
2086 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2087 video_id = mobj.group(1)
2088 if video_id not in already_seen:
2089 video_ids.append(video_id)
2090 already_seen.add(video_id)
2091 if len(video_ids) == n:
2092 # Specified n videos reached
2093 for id in video_ids:
2094 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2097 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2098 for id in video_ids:
2099 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2102 pagenum = pagenum + 1
2104 class YoutubePlaylistIE(InfoExtractor):
2105 """Information Extractor for YouTube playlists."""
2107 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist)\?.*?(p|a)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2108 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2109 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2110 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2113 def __init__(self, youtube_ie, downloader=None):
2114 InfoExtractor.__init__(self, downloader)
2115 self._youtube_ie = youtube_ie
2119 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
2121 def report_download_page(self, playlist_id, pagenum):
2122 """Report attempt to download playlist page with given number."""
2123 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2125 def _real_initialize(self):
2126 self._youtube_ie.initialize()
2128 def _real_extract(self, url):
2129 # Extract playlist id
2130 mobj = re.match(self._VALID_URL, url)
2132 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2136 if mobj.group(3) is not None:
2137 self._youtube_ie.extract(mobj.group(3))
2140 # Download playlist pages
2141 # prefix is 'p' as default for playlists but there are other types that need extra care
2142 playlist_prefix = mobj.group(1)
2143 if playlist_prefix == 'a':
2144 playlist_access = 'artist'
2146 playlist_prefix = 'p'
2147 playlist_access = 'view_play_list'
2148 playlist_id = mobj.group(2)
2153 self.report_download_page(playlist_id, pagenum)
2154 request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2156 page = urllib2.urlopen(request).read()
2157 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2158 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2161 # Extract video identifiers
2163 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2164 if mobj.group(1) not in ids_in_page:
2165 ids_in_page.append(mobj.group(1))
2166 video_ids.extend(ids_in_page)
2168 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2170 pagenum = pagenum + 1
2172 playliststart = self._downloader.params.get('playliststart', 1) - 1
2173 playlistend = self._downloader.params.get('playlistend', -1)
2174 video_ids = video_ids[playliststart:playlistend]
2176 for id in video_ids:
2177 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2180 class YoutubeUserIE(InfoExtractor):
2181 """Information Extractor for YouTube users."""
2183 _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2184 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2185 _GDATA_PAGE_SIZE = 50
2186 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2187 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2190 def __init__(self, youtube_ie, downloader=None):
2191 InfoExtractor.__init__(self, downloader)
2192 self._youtube_ie = youtube_ie
2196 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2198 def report_download_page(self, username, start_index):
2199 """Report attempt to download user page."""
2200 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2201 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2203 def _real_initialize(self):
2204 self._youtube_ie.initialize()
2206 def _real_extract(self, url):
2208 mobj = re.match(self._VALID_URL, url)
2210 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2213 username = mobj.group(1)
2215 # Download video ids using YouTube Data API. Result size per
2216 # query is limited (currently to 50 videos) so we need to query
2217 # page by page until there are no video ids - it means we got
2224 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2225 self.report_download_page(username, start_index)
2227 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2230 page = urllib2.urlopen(request).read()
2231 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2232 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2235 # Extract video identifiers
2238 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2239 if mobj.group(1) not in ids_in_page:
2240 ids_in_page.append(mobj.group(1))
2242 video_ids.extend(ids_in_page)
2244 # A little optimization - if current page is not
2245 # "full", ie. does not contain PAGE_SIZE video ids then
2246 # we can assume that this page is the last one - there
2247 # are no more ids on further pages - no need to query
2250 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2255 all_ids_count = len(video_ids)
2256 playliststart = self._downloader.params.get('playliststart', 1) - 1
2257 playlistend = self._downloader.params.get('playlistend', -1)
2259 if playlistend == -1:
2260 video_ids = video_ids[playliststart:]
2262 video_ids = video_ids[playliststart:playlistend]
2264 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2265 (username, all_ids_count, len(video_ids)))
2267 for video_id in video_ids:
2268 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2271 class DepositFilesIE(InfoExtractor):
2272 """Information extractor for depositfiles.com"""
2274 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2276 def __init__(self, downloader=None):
2277 InfoExtractor.__init__(self, downloader)
2281 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2283 def report_download_webpage(self, file_id):
2284 """Report webpage download."""
2285 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2287 def report_extraction(self, file_id):
2288 """Report information extraction."""
2289 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2291 def _real_initialize(self):
2294 def _real_extract(self, url):
2295 # At this point we have a new file
2296 self._downloader.increment_downloads()
2298 file_id = url.split('/')[-1]
2299 # Rebuild url in english locale
2300 url = 'http://depositfiles.com/en/files/' + file_id
2302 # Retrieve file webpage with 'Free download' button pressed
2303 free_download_indication = { 'gateway_result' : '1' }
2304 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2306 self.report_download_webpage(file_id)
2307 webpage = urllib2.urlopen(request).read()
2308 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2309 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2312 # Search for the real file URL
2313 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2314 if (mobj is None) or (mobj.group(1) is None):
2315 # Try to figure out reason of the error.
2316 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2317 if (mobj is not None) and (mobj.group(1) is not None):
2318 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2319 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2321 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2324 file_url = mobj.group(1)
2325 file_extension = os.path.splitext(file_url)[1][1:]
2327 # Search for file title
2328 mobj = re.search(r'<b title="(.*?)">', webpage)
2330 self._downloader.trouble(u'ERROR: unable to extract title')
2332 file_title = mobj.group(1).decode('utf-8')
2335 # Process file information
2336 self._downloader.process_info({
2337 'id': file_id.decode('utf-8'),
2338 'url': file_url.decode('utf-8'),
2340 'upload_date': u'NA',
2341 'title': file_title,
2342 'stitle': file_title,
2343 'ext': file_extension.decode('utf-8'),
2347 except UnavailableVideoError, err:
2348 self._downloader.trouble(u'ERROR: unable to download file')
2350 class FacebookIE(InfoExtractor):
2351 """Information Extractor for Facebook"""
2353 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2354 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2355 _NETRC_MACHINE = 'facebook'
2356 _available_formats = ['highqual', 'lowqual']
2357 _video_extensions = {
2362 def __init__(self, downloader=None):
2363 InfoExtractor.__init__(self, downloader)
2367 return (re.match(FacebookIE._VALID_URL, url) is not None)
2369 def _reporter(self, message):
2370 """Add header and report message."""
2371 self._downloader.to_screen(u'[facebook] %s' % message)
2373 def report_login(self):
2374 """Report attempt to log in."""
2375 self._reporter(u'Logging in')
2377 def report_video_webpage_download(self, video_id):
2378 """Report attempt to download video webpage."""
2379 self._reporter(u'%s: Downloading video webpage' % video_id)
2381 def report_information_extraction(self, video_id):
2382 """Report attempt to extract video information."""
2383 self._reporter(u'%s: Extracting video information' % video_id)
2385 def _parse_page(self, video_webpage):
2386 """Extract video information from page"""
2388 data = {'title': r'class="video_title datawrap">(.*?)</',
2389 'description': r'<div class="datawrap">(.*?)</div>',
2390 'owner': r'\("video_owner_name", "(.*?)"\)',
2391 'upload_date': r'data-date="(.*?)"',
2392 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2395 for piece in data.keys():
2396 mobj = re.search(data[piece], video_webpage)
2397 if mobj is not None:
2398 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2402 for fmt in self._available_formats:
2403 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2404 if mobj is not None:
2405 # URL is in a Javascript segment inside an escaped Unicode format within
2406 # the generally utf-8 page
2407 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2408 video_info['video_urls'] = video_urls
2412 def _real_initialize(self):
2413 if self._downloader is None:
2418 downloader_params = self._downloader.params
2420 # Attempt to use provided username and password or .netrc data
2421 if downloader_params.get('username', None) is not None:
2422 useremail = downloader_params['username']
2423 password = downloader_params['password']
2424 elif downloader_params.get('usenetrc', False):
2426 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2427 if info is not None:
2431 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2432 except (IOError, netrc.NetrcParseError), err:
2433 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2436 if useremail is None:
2445 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2448 login_results = urllib2.urlopen(request).read()
2449 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2450 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2452 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2453 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2456 def _real_extract(self, url):
2457 mobj = re.match(self._VALID_URL, url)
2459 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2461 video_id = mobj.group('ID')
2464 self.report_video_webpage_download(video_id)
2465 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2467 page = urllib2.urlopen(request)
2468 video_webpage = page.read()
2469 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2470 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2473 # Start extracting information
2474 self.report_information_extraction(video_id)
2476 # Extract information
2477 video_info = self._parse_page(video_webpage)
2480 if 'owner' not in video_info:
2481 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2483 video_uploader = video_info['owner']
2486 if 'title' not in video_info:
2487 self._downloader.trouble(u'ERROR: unable to extract video title')
2489 video_title = video_info['title']
2490 video_title = video_title.decode('utf-8')
2491 video_title = sanitize_title(video_title)
2494 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2495 simple_title = simple_title.strip(ur'_')
2498 if 'thumbnail' not in video_info:
2499 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2500 video_thumbnail = ''
2502 video_thumbnail = video_info['thumbnail']
2506 if 'upload_date' in video_info:
2507 upload_time = video_info['upload_date']
2508 timetuple = email.utils.parsedate_tz(upload_time)
2509 if timetuple is not None:
2511 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2516 video_description = 'No description available.'
2517 if (self._downloader.params.get('forcedescription', False) and
2518 'description' in video_info):
2519 video_description = video_info['description']
2521 url_map = video_info['video_urls']
2522 if len(url_map.keys()) > 0:
2523 # Decide which formats to download
2524 req_format = self._downloader.params.get('format', None)
2525 format_limit = self._downloader.params.get('format_limit', None)
2527 if format_limit is not None and format_limit in self._available_formats:
2528 format_list = self._available_formats[self._available_formats.index(format_limit):]
2530 format_list = self._available_formats
2531 existing_formats = [x for x in format_list if x in url_map]
2532 if len(existing_formats) == 0:
2533 self._downloader.trouble(u'ERROR: no known formats available for video')
2535 if req_format is None:
2536 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2537 elif req_format == '-1':
2538 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2541 if req_format not in url_map:
2542 self._downloader.trouble(u'ERROR: requested format not available')
2544 video_url_list = [(req_format, url_map[req_format])] # Specific format
2546 for format_param, video_real_url in video_url_list:
2548 # At this point we have a new video
2549 self._downloader.increment_downloads()
2552 video_extension = self._video_extensions.get(format_param, 'mp4')
2554 # Find the video URL in fmt_url_map or conn paramters
2556 # Process video information
2557 self._downloader.process_info({
2558 'id': video_id.decode('utf-8'),
2559 'url': video_real_url.decode('utf-8'),
2560 'uploader': video_uploader.decode('utf-8'),
2561 'upload_date': upload_date,
2562 'title': video_title,
2563 'stitle': simple_title,
2564 'ext': video_extension.decode('utf-8'),
2565 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2566 'thumbnail': video_thumbnail.decode('utf-8'),
2567 'description': video_description.decode('utf-8'),
2570 except UnavailableVideoError, err:
2571 self._downloader.trouble(u'\nERROR: unable to download video')
class PostProcessor(object):
	"""Post Processor class.

	PostProcessor objects can be added to downloaders with their
	add_post_processor() method. When the downloader has finished a
	successful download, it will take its internal chain of PostProcessors
	and start calling the run() method on each one of them, first with
	an initial argument and then with the returned value of the previous
	PostProcessor.

	The chain will be stopped if one of them ever returns None or the end
	of the chain is reached.

	PostProcessor objects follow a "mutual registration" process similar
	to InfoExtractor objects.
	"""

	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Run the PostProcessor.

		The "information" argument is a dictionary like the ones
		composed by InfoExtractors. The only difference is that this
		one has an extra field called "filepath" that points to the
		downloaded file.

		When this method returns None, the postprocessing chain is
		stopped. However, this method may return an information
		dictionary that will be passed to the next postprocessing
		object in the chain. It can be the one it received after
		changing some fields.

		In addition, this method may raise a PostProcessingError
		exception that will be taken into account by the downloader
		it was called from.
		"""
		# Base implementation: pass the info dict through untouched.
		return information
class FFmpegExtractAudioPP(PostProcessor):
	"""Post processor that extracts the audio track of a downloaded video.

	Uses ffprobe to detect the source audio codec and ffmpeg to copy or
	transcode it into a stand-alone audio file, then removes the video.
	"""

	def __init__(self, downloader=None, preferredcodec=None):
		PostProcessor.__init__(self, downloader)
		if preferredcodec is None:
			preferredcodec = 'best'
		# 'best' keeps aac/mp3 losslessly, otherwise 'aac' or 'mp3'.
		self._preferredcodec = preferredcodec

	@staticmethod
	def get_audio_codec(path):
		"""Return the audio codec name of the file at path, or None."""
		try:
			cmd = ['ffprobe', '-show_streams', '--', path]
			# FIX: the devnull handle used to be opened via file() and
			# never closed, leaking a file descriptor per invocation.
			devnull = open(os.path.devnull, 'w')
			try:
				handle = subprocess.Popen(cmd, stderr=devnull, stdout=subprocess.PIPE)
				output = handle.communicate()[0]
				if handle.wait() != 0:
					return None
			finally:
				devnull.close()
		except (IOError, OSError):
			return None
		audio_codec = None
		for line in output.split('\n'):
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:
				# codec_name precedes codec_type within a stream block.
				return audio_codec
		return None

	@staticmethod
	def run_ffmpeg(path, out_path, codec, more_opts):
		"""Run ffmpeg to extract/convert audio; return True on success."""
		try:
			cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
			# FIX: close the devnull handle (was leaked with file()).
			devnull = open(os.path.devnull, 'w')
			try:
				ret = subprocess.call(cmd, stdout=devnull, stderr=subprocess.STDOUT)
			finally:
				devnull.close()
			return (ret == 0)
		except (IOError, OSError):
			return False

	def run(self, information):
		path = information['filepath']

		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
			return None

		more_opts = []
		if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
			if filecodec == 'aac' or filecodec == 'mp3':
				# Lossless if possible
				acodec = 'copy'
				extension = filecodec
				if filecodec == 'aac':
					more_opts = ['-f', 'adts']
			else:
				# MP3 otherwise.
				acodec = 'libmp3lame'
				extension = 'mp3'
				more_opts = ['-ab', '128k']
		else:
			# We convert the audio (lossy)
			acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
			extension = self._preferredcodec
			more_opts = ['-ab', '128k']
			if self._preferredcodec == 'aac':
				more_opts += ['-f', 'adts']

		(prefix, ext) = os.path.splitext(path)
		new_path = prefix + '.' + extension
		self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
		status = self.run_ffmpeg(path, new_path, acodec, more_opts)

		if not status:
			self._downloader.to_stderr(u'WARNING: error running ffmpeg')
			return None

		try:
			os.remove(path)
		except (IOError, OSError):
			self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
			return None

		information['filepath'] = new_path
		return information
2701 ### MAIN PROGRAM ###
2702 if __name__ == '__main__':
2704 # Modules needed only when running the main program
2708 # Function to update the program file with the latest version from the repository.
2709 def update_self(downloader, filename):
2710 # Note: downloader only used for options
2711 if not os.access(filename, os.W_OK):
2712 sys.exit('ERROR: no write permissions on %s' % filename)
2714 downloader.to_screen('Updating to latest stable version...')
2716 latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
2717 latest_version = urllib.urlopen(latest_url).read().strip()
2718 prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
2719 newcontent = urllib.urlopen(prog_url).read()
2720 except (IOError, OSError), err:
2721 sys.exit('ERROR: unable to download latest version')
2723 stream = open(filename, 'w')
2724 stream.write(newcontent)
2726 except (IOError, OSError), err:
2727 sys.exit('ERROR: unable to overwrite current version')
2728 downloader.to_screen('Updated to version %s' % latest_version)
2730 # Parse command line
2731 parser = optparse.OptionParser(
2732 usage='Usage: %prog [options] url...',
2733 version='2011.08.04',
2734 conflict_handler='resolve',
2737 parser.add_option('-h', '--help',
2738 action='help', help='print this help text and exit')
2739 parser.add_option('-v', '--version',
2740 action='version', help='print program version and exit')
2741 parser.add_option('-U', '--update',
2742 action='store_true', dest='update_self', help='update this program to latest stable version')
2743 parser.add_option('-i', '--ignore-errors',
2744 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
2745 parser.add_option('-r', '--rate-limit',
2746 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
2747 parser.add_option('-R', '--retries',
2748 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
2749 parser.add_option('--playlist-start',
2750 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
2751 parser.add_option('--playlist-end',
2752 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
2753 parser.add_option('--dump-user-agent',
2754 action='store_true', dest='dump_user_agent',
2755 help='display the current browser identification', default=False)
2757 authentication = optparse.OptionGroup(parser, 'Authentication Options')
2758 authentication.add_option('-u', '--username',
2759 dest='username', metavar='USERNAME', help='account username')
2760 authentication.add_option('-p', '--password',
2761 dest='password', metavar='PASSWORD', help='account password')
2762 authentication.add_option('-n', '--netrc',
2763 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
2764 parser.add_option_group(authentication)
2766 video_format = optparse.OptionGroup(parser, 'Video Format Options')
2767 video_format.add_option('-f', '--format',
2768 action='store', dest='format', metavar='FORMAT', help='video format code')
2769 video_format.add_option('--all-formats',
2770 action='store_const', dest='format', help='download all available video formats', const='-1')
2771 video_format.add_option('--max-quality',
2772 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
2773 parser.add_option_group(video_format)
2775 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
2776 verbosity.add_option('-q', '--quiet',
2777 action='store_true', dest='quiet', help='activates quiet mode', default=False)
2778 verbosity.add_option('-s', '--simulate',
2779 action='store_true', dest='simulate', help='do not download video', default=False)
2780 verbosity.add_option('-g', '--get-url',
2781 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
2782 verbosity.add_option('-e', '--get-title',
2783 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
2784 verbosity.add_option('--get-thumbnail',
2785 action='store_true', dest='getthumbnail',
2786 help='simulate, quiet but print thumbnail URL', default=False)
2787 verbosity.add_option('--get-description',
2788 action='store_true', dest='getdescription',
2789 help='simulate, quiet but print video description', default=False)
2790 verbosity.add_option('--get-filename',
2791 action='store_true', dest='getfilename',
2792 help='simulate, quiet but print output filename', default=False)
2793 verbosity.add_option('--no-progress',
2794 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
2795 verbosity.add_option('--console-title',
2796 action='store_true', dest='consoletitle',
2797 help='display progress in console titlebar', default=False)
2798 parser.add_option_group(verbosity)
2800 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
2801 filesystem.add_option('-t', '--title',
2802 action='store_true', dest='usetitle', help='use title in file name', default=False)
2803 filesystem.add_option('-l', '--literal',
2804 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
2805 filesystem.add_option('-A', '--auto-number',
2806 action='store_true', dest='autonumber',
2807 help='number downloaded files starting from 00000', default=False)
2808 filesystem.add_option('-o', '--output',
2809 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
2810 filesystem.add_option('-a', '--batch-file',
2811 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
2812 filesystem.add_option('-w', '--no-overwrites',
2813 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
2814 filesystem.add_option('-c', '--continue',
2815 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
2816 filesystem.add_option('--cookies',
2817 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
2818 filesystem.add_option('--no-part',
2819 action='store_true', dest='nopart', help='do not use .part files', default=False)
2820 filesystem.add_option('--no-mtime',
2821 action='store_false', dest='updatetime',
2822 help='do not use the Last-modified header to set the file modification time', default=True)
2823 parser.add_option_group(filesystem)
2825 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
2826 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
2827 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
2828 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
2829 help='"best", "aac" or "mp3"; best by default')
2830 parser.add_option_group(postproc)
2832 (opts, args) = parser.parse_args()
2834 # Open appropriate CookieJar
2835 if opts.cookiefile is None:
2836 jar = cookielib.CookieJar()
2839 jar = cookielib.MozillaCookieJar(opts.cookiefile)
2840 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
2842 except (IOError, OSError), err:
2843 sys.exit(u'ERROR: unable to open cookie file')
2846 if opts.dump_user_agent:
2847 print std_headers['User-Agent']
2850 # General configuration
2851 cookie_processor = urllib2.HTTPCookieProcessor(jar)
2852 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()))
2853 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
2855 # Batch file verification
2857 if opts.batchfile is not None:
2859 if opts.batchfile == '-':
2862 batchfd = open(opts.batchfile, 'r')
2863 batchurls = batchfd.readlines()
2864 batchurls = [x.strip() for x in batchurls]
2865 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
2867 sys.exit(u'ERROR: batch file could not be read')
2868 all_urls = batchurls + args
2870 # Conflicting, missing and erroneous options
2871 if opts.usenetrc and (opts.username is not None or opts.password is not None):
2872 parser.error(u'using .netrc conflicts with giving username/password')
2873 if opts.password is not None and opts.username is None:
2874 parser.error(u'account username missing')
2875 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
2876 parser.error(u'using output template conflicts with using title, literal title or auto number')
2877 if opts.usetitle and opts.useliteral:
# NOTE(review): this is a numbered partial listing of the tail of a
# Python-2-era youtube-dl main routine.  Interior lines are missing from
# the extraction (gaps in the embedded numbering — e.g. the `try:`
# statements that pair with the visible `except` clauses, and the `})`
# closing the FileDownloader options dict), so no code is altered here;
# comments only.

# --- Command-line option validation (parser.error exits the process) ---
2878 parser.error(u'using title conflicts with using literal title')
2879 if opts.username is not None and opts.password is None:
# Prompt interactively so the password is never typed on the command line.
2880 opts.password = getpass.getpass(u'Type account password and press return:')
2881 if opts.ratelimit is not None:
# parse_bytes turns strings like "50k" into a byte count; None means unparseable.
2882 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
2883 if numeric_limit is None:
2884 parser.error(u'invalid rate limit specified')
2885 opts.ratelimit = numeric_limit
2886 if opts.retries is not None:
# (missing line 2887 is presumably `try:` — TODO confirm against the full file)
2888 opts.retries = long(opts.retries)
2889 except (TypeError, ValueError), err:
2890 parser.error(u'invalid retry count specified')
# Playlist bounds: start must be a positive integer (1-based).
2892 opts.playliststart = long(opts.playliststart)
2893 if opts.playliststart <= 0:
2895 except (TypeError, ValueError), err:
2896 parser.error(u'invalid playlist start number specified')
# End may be -1 ("no limit"); otherwise it must be positive and >= start.
2898 opts.playlistend = long(opts.playlistend)
2899 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
2901 except (TypeError, ValueError), err:
2902 parser.error(u'invalid playlist end number specified')
2903 if opts.extractaudio:
# Only these three audio codecs are accepted by the FFmpeg post-processor below.
2904 if opts.audioformat not in ['best', 'aac', 'mp3']:
2905 parser.error(u'invalid audio format specified')
2907 # Information extractors
# Several extractors delegate embedded-video handling to youtube_ie,
# which is why it is passed into their constructors.
2908 youtube_ie = YoutubeIE()
2909 metacafe_ie = MetacafeIE(youtube_ie)
2910 dailymotion_ie = DailymotionIE()
2911 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
2912 youtube_user_ie = YoutubeUserIE(youtube_ie)
2913 youtube_search_ie = YoutubeSearchIE(youtube_ie)
2914 google_ie = GoogleIE()
2915 google_search_ie = GoogleSearchIE(google_ie)
2916 photobucket_ie = PhotobucketIE()
2917 yahoo_ie = YahooIE()
2918 yahoo_search_ie = YahooSearchIE(yahoo_ie)
2919 deposit_files_ie = DepositFilesIE()
2920 facebook_ie = FacebookIE()
2921 generic_ie = GenericIE()
# --- Build the downloader from the validated options ---
2924 fd = FileDownloader({
2925 'usenetrc': opts.usenetrc,
2926 'username': opts.username,
2927 'password': opts.password,
# Any of the "just print X" modes implies quiet *and* simulate (no download).
2928 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
2929 'forceurl': opts.geturl,
2930 'forcetitle': opts.gettitle,
2931 'forcethumbnail': opts.getthumbnail,
2932 'forcedescription': opts.getdescription,
2933 'forcefilename': opts.getfilename,
2934 'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
2935 'format': opts.format,
2936 'format_limit': opts.format_limit,
# Output template: an explicit -o wins; otherwise the first matching
# `and`/`or` arm supplies a default pattern (format -1 keeps the format
# id in the name; stitle = sanitized title, title = literal title).
2937 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
2938 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
2939 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
2940 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
2941 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
2942 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
2943 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
2944 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
2945 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
2946 or u'%(id)s.%(ext)s'),
2947 'ignoreerrors': opts.ignoreerrors,
2948 'ratelimit': opts.ratelimit,
2949 'nooverwrites': opts.nooverwrites,
2950 'retries': opts.retries,
2951 'continuedl': opts.continue_dl,
2952 'noprogress': opts.noprogress,
2953 'playliststart': opts.playliststart,
2954 'playlistend': opts.playlistend,
# Writing the video to stdout ("-o -") forces status output to stderr.
2955 'logtostderr': opts.outtmpl == '-',
2956 'consoletitle': opts.consoletitle,
2957 'nopart': opts.nopart,
2958 'updatetime': opts.updatetime,
# --- Register extractors; order matters: first match wins ---
2960 fd.add_info_extractor(youtube_search_ie)
2961 fd.add_info_extractor(youtube_pl_ie)
2962 fd.add_info_extractor(youtube_user_ie)
2963 fd.add_info_extractor(metacafe_ie)
2964 fd.add_info_extractor(dailymotion_ie)
2965 fd.add_info_extractor(youtube_ie)
2966 fd.add_info_extractor(google_ie)
2967 fd.add_info_extractor(google_search_ie)
2968 fd.add_info_extractor(photobucket_ie)
2969 fd.add_info_extractor(yahoo_ie)
2970 fd.add_info_extractor(yahoo_search_ie)
2971 fd.add_info_extractor(deposit_files_ie)
2972 fd.add_info_extractor(facebook_ie)
2974 # This must come last since it's the
2975 # fallback if none of the others work
2976 fd.add_info_extractor(generic_ie)
# Optional FFmpeg audio-extraction post-processor (codec validated above).
2979 if opts.extractaudio:
2980 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))
# Self-update mode: replace the running script, then fall through.
2983 if opts.update_self:
2984 update_self(fd, sys.argv[0])
# URLs are required unless we only came here to self-update.
# (all_urls is built on lines missing from this extraction — see gap at 2985-2986.)
2987 if len(all_urls) < 1:
2988 if not opts.update_self:
2989 parser.error(u'you must provide at least one URL')
# download() returns the process exit code (nonzero on failure).
2992 retcode = fd.download(all_urls)
2994 # Dump cookie jar if requested
2995 if opts.cookiefile is not None:
# Saving is best-effort but a failure is fatal: exit with an error message.
2998 except (IOError, OSError), err:
2999 sys.exit(u'ERROR: unable to save cookie jar')
# --- Top-level exception handling for the whole driver ---
# DownloadError has already been reported by the downloader; the other
# two get a one-line message.  (The matching `try:` is above this chunk.)
3003 except DownloadError:
3005 except SameFileError:
3006 sys.exit(u'ERROR: fixed output name but more than one file to download')
3007 except KeyboardInterrupt:
3008 sys.exit(u'\nERROR: Interrupted by user')
3010 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: