2 # -*- coding: utf-8 -*-
5 "Ricardo Garcia Gonzalez",
14 __license__ = "Public Domain"
15 __version__ = '2011.08.04'
40 # parse_qs was moved from the cgi module to the urlparse module recently.
42 from urlparse import parse_qs
44 from cgi import parse_qs
    # Standard headers sent with every HTTP request; mimics Firefox 5 on
    # Linux so servers return their regular desktop pages.
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',

# Characters considered "safe" for simplified titles: ASCII letters and digits.
# .decode('ascii') converts the Python 2 byte strings to unicode.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
# NOTE(review): the lookup is wrapped in a generator so the (elided)
# try/except fallback around locale.getpreferredencoding() runs lazily;
# .next() is the Python 2 iterator protocol.
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    def yield_preferredencoding():
        pref = locale.getpreferredencoding()
    return yield_preferredencoding().next()  # first (and only) yielded value
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a Unicode character.

    This function receives a match object and is intended to be used with
    the re.sub() function.
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in htmlentitydefs.name2codepoint:
        return unichr(htmlentitydefs.name2codepoint[entity])

    # Numeric character reference, decimal or hex ("x" prefix).
    # NOTE(review): \d does not match hex digits a-f, so references such as
    # &#xe9; appear not to match here — confirm and widen the class if so.
    mobj = re.match(ur'(?u)#(x?\d+)', entity)
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            numstr = u'0%s' % numstr  # "0x…" form for long(numstr, 16)
        return unichr(long(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
98 def sanitize_title(utitle):
99 """Sanitizes a video title so it could be used as part of a filename."""
100 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
101 return utitle.replace(unicode(os.sep), u'%')
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    It returns the tuple (stream, definitive_file_name).
        if sys.platform == 'win32':
            msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)  # binary stdout so data is not mangled
        return (sys.stdout, filename)
        stream = open(filename, open_mode)
        return (stream, filename)
    except (IOError, OSError), err:
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(filename, open_mode)
        return (stream, filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    # email.utils.parsedate_tz returns None for unparsable input.
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)  # epoch seconds, honouring the TZ offset
# Fatal download failure; raised by FileDownloader.trouble() unless the
# 'ignoreerrors' option is set.
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
# Raised by FileDownloader.download() when a fixed output template would
# force several URLs to be written to the same file.
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
# Signals a failed postprocessing step; caught in process_info().
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
# Raised in process_info() when downloading the video data fails with an
# OS/IO error (see the _do_download call there).
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
# Raised when fewer bytes arrive than the server's Content-Length announced.
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    def __init__(self, downloaded, expected):
        # Byte counts let the caller report "expected X, served Y".
        self.downloaded = downloaded
        self.expected = expected
class YoutubeDLHandler(urllib2.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.
    Part of this code was copied from:
    http://techknack.net/python-urllib2-handlers/
    Andrew Rowls, the author of that code, agreed to release it to the
            return zlib.decompress(data, -zlib.MAX_WBITS)  # raw deflate stream (no zlib header)
            return zlib.decompress(data)  # fallback: zlib-wrapped deflate

    # Older urllib2.addinfourl versions have no 'code' constructor argument.
    def addinfourl_wrapper(stream, headers, url, code):
        if hasattr(urllib2.addinfourl, 'getcode'):
            return urllib2.addinfourl(stream, headers, url, code)
        ret = urllib2.addinfourl(stream, headers, url)

    def http_request(self, req):
        # Apply every standard header to the outgoing request.
        for h in std_headers:
            req.add_header(h, std_headers[h])
        if 'Youtubedl-no-compression' in req.headers:
            # Internal marker: strip it and the Accept-encoding header so
            # this particular request gets an identity (uncompressed) body.
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']

    def http_response(self, req, resp):
        # Transparently unwrap gzip-encoded bodies...
        if resp.headers.get('Content-encoding', '') == 'gzip':
            gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # ...and deflate-encoded bodies.
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = StringIO.StringIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
class FileDownloader(object):
    """File Downloader class.

    File downloader objects are the ones responsible of downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. As, given a video URL, the downloader doesn't know how to
    extract all the needed information, task that InfoExtractors do, it
    has to pass the URL to one of them.

    For this, file downloader objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the file downloader handles it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    asks the FileDownloader to process the video information, possibly
    downloading the video.

    File downloaders accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The FileDownloader also
    registers itself as the downloader in charge for the InfoExtractors
    that are added to it, so this is a "mutual registration".

    username: Username for authentication purposes.
    password: Password for authentication purposes.
    usenetrc: Use netrc for authentication instead.
    quiet: Do not print messages to stdout.
    forceurl: Force printing final URL.
    forcetitle: Force printing title.
    forcethumbnail: Force printing thumbnail URL.
    forcedescription: Force printing description.
    forcefilename: Force printing final filename.
    simulate: Do not download the video files.
    format: Video format code.
    format_limit: Highest quality format to try.
    outtmpl: Template for output names.
    ignoreerrors: Do not stop on download errors.
    ratelimit: Download speed limit, in bytes/sec.
    nooverwrites: Prevent overwriting files.
    retries: Number of times to retry for HTTP error 5xx
    continuedl: Try to continue downloads if possible.
    noprogress: Do not print the progress bar.
    playliststart: Playlist item to start at.
    playlistend: Playlist item to end at.
    logtostderr: Log messages to stderr instead of stdout.
    consoletitle: Display progress in console window's titlebar.
    nopart: Do not use temporary .part files.
    updatetime: Use the Last-modified header to set output file timestamps.
    _download_retcode = None  # process return code: 0 until a download error occurs
    _num_downloads = None     # ordinal of the current download, feeds %(autonumber)s

    def __init__(self, params):
        """Create a FileDownloader object with the given options."""
        self._download_retcode = 0
        self._num_downloads = 0
        # Messages go to stderr instead of stdout when 'logtostderr' is set.
        self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
    def pmkdir(filename):
        """Create directory components in filename. Similar to Unix "mkdir -p"."""
        components = filename.split(os.sep)
        # Every ancestor path of the file, shortest first.
        aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
        aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
        for dir in aggregate:
            if not os.path.exists(dir):
    # Render a byte count as a human-readable string, e.g. '1.21M'.
    def format_bytes(bytes):
        if type(bytes) is str:
        exponent = long(math.log(bytes, 1024.0))
        suffix = 'bkMGTPEZY'[exponent]  # 1024-based unit suffixes
        converted = float(bytes) / float(1024**exponent)
        return '%.2f%s' % (converted, suffix)
    def calc_percent(byte_counter, data_len):
        # Percentage complete as a fixed-width 6-char string, e.g. ' 42.3%'.
        return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
    # Estimated time to completion as 'MM:SS'; 'dif' is the elapsed seconds
    # computed on a preceding (elided) line.
    def calc_eta(start, now, total, current):
        if current == 0 or dif < 0.001: # One millisecond
        rate = float(current) / dif
        eta = long((float(total) - float(current)) / rate)
        (eta_mins, eta_secs) = divmod(eta, 60)
        return '%02d:%02d' % (eta_mins, eta_secs)
    # Current transfer rate as a fixed-width string, e.g. '  1.21Mb/s'.
    def calc_speed(start, now, bytes):
        if bytes == 0 or dif < 0.001: # One millisecond
            return '%10s' % '---b/s'
        return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
    # Adapt the next read size to recent throughput, keeping it within
    # [bytes/2, 4 MiB].
    def best_block_size(elapsed_time, bytes):
        new_min = max(bytes / 2.0, 1.0)
        new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
        if elapsed_time < 0.001:
        rate = bytes / elapsed_time
    def parse_bytes(bytestr):
        """Parse a string indicating a byte quantity into a long integer."""
        # Accepts e.g. '500', '10.5k', '2M' (suffix case-insensitive).
        matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
        number = float(matchobj.group(1))
        multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())  # ''.index is 0 -> multiplier 1
        return long(round(number * multiplier))
    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        # Mutual registration: the IE learns its downloader here.
        ie.set_downloader(self)
    def add_post_processor(self, pp):
        """Add a PostProcessor object to the end of the chain."""
        # Mutual registration: the postprocessor learns its downloader here.
        pp.set_downloader(self)
    def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
        """Print message to stdout if not in quiet mode."""
        if not self.params.get('quiet', False):
            terminator = [u'\n', u''][skip_eol]  # skip_eol suppresses the newline (progress lines)
            print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
            self._screen_file.flush()
        except (UnicodeEncodeError), err:
            # Encoding failures are fatal unless the caller opted out.
            if not ignore_encoding_errors:
408 def to_stderr(self, message):
409 """Print message to stderr."""
410 print >>sys.stderr, message.encode(preferredencoding())
    def to_cons_title(self, message):
        """Set console/terminal window title to message."""
        if not self.params.get('consoletitle', False):
        if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
            # c_wchar_p() might not be necessary if `message` is
            # already of type unicode()
            ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
        elif 'TERM' in os.environ:
            sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))  # xterm title escape
423 def fixed_template(self):
424 """Checks if the output template is fixed."""
425 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
427 def trouble(self, message=None):
428 """Determine action to take when a download problem appears.
430 Depending on if the downloader has been configured to ignore
431 download errors or not, this method may throw an exception or
432 not when errors are found, after printing the message.
434 if message is not None:
435 self.to_stderr(message)
436 if not self.params.get('ignoreerrors', False):
437 raise DownloadError(message)
438 self._download_retcode = 1
    def slow_down(self, start_time, byte_counter):
        """Sleep if the download speed is over the rate limit."""
        rate_limit = self.params.get('ratelimit', None)
        if rate_limit is None or byte_counter == 0:
        elapsed = now - start_time
        speed = float(byte_counter) / elapsed
        if speed > rate_limit:
            # Sleep just long enough to bring the average speed back under the cap.
            time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
    # Downloads normally go to '<name>.part' first; the real name is used
    # directly for stdout, disabled .part files, or non-regular files.
    def temp_name(self, filename):
        """Returns a temporary filename for the given filename."""
        if self.params.get('nopart', False) or filename == u'-' or \
                (os.path.exists(filename) and not os.path.isfile(filename)):
        return filename + u'.part'
    def undo_temp_name(self, filename):
        # Strip the '.part' suffix added by temp_name(), if present.
        if filename.endswith(u'.part'):
            return filename[:-len(u'.part')]
    def try_rename(self, old_filename, new_filename):
        # Move the finished file into place; failure is reported via
        # trouble(), not raised directly.
        if old_filename == new_filename:
        os.rename(old_filename, new_filename)
        except (IOError, OSError), err:
            self.trouble(u'ERROR: unable to rename file')
    def try_utime(self, filename, last_modified_hdr):
        """Try to set the last-modified time of the given file."""
        if last_modified_hdr is None:
        if not os.path.isfile(filename):
        timestr = last_modified_hdr
        filetime = timeconvert(timestr)
        os.utime(filename,(time.time(), filetime))  # atime stays current, mtime comes from the header
490 def report_destination(self, filename):
491 """Report destination filename."""
492 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
    def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
        """Report download progress."""
        if self.params.get('noprogress', False):
        # \r rewrites the current console line in place.
        self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
        self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
                (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
503 def report_resuming_byte(self, resume_len):
504 """Report attempt to resume at given byte."""
505 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
507 def report_retry(self, count, retries):
508 """Report retry in case of HTTP error 5xx"""
509 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
    def report_file_already_downloaded(self, file_name):
        """Report file has already been fully downloaded."""
        self.to_screen(u'[download] %s has already been downloaded' % file_name)
        except (UnicodeEncodeError), err:
            # Fall back to a message without the (unencodable) file name.
            self.to_screen(u'[download] The file has already been downloaded')
518 def report_unable_to_resume(self):
519 """Report it was impossible to resume download."""
520 self.to_screen(u'[download] Unable to resume')
    def report_finish(self):
        """Report download finished."""
        # With the progress bar disabled, print an explicit completion line.
        if self.params.get('noprogress', False):
            self.to_screen(u'[download] Download completed')
529 def increment_downloads(self):
530 """Increment the ordinal that assigns a number to each file."""
531 self._num_downloads += 1
    def prepare_filename(self, info_dict):
        """Generate the output filename."""
        template_dict = dict(info_dict)  # copy, so the caller's dict is not mutated
        template_dict['epoch'] = unicode(long(time.time()))
        template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
        filename = self.params['outtmpl'] % template_dict
        except (ValueError, KeyError), err:
            self.trouble(u'ERROR: invalid system charset or erroneous output template')
    def process_info(self, info_dict):
        """Process a single dictionary returned by an InfoExtractor."""
        filename = self.prepare_filename(info_dict)
        # Do nothing else if in simulate mode
        if self.params.get('simulate', False):
            # Forced printing for the --get-* options; 'xmlcharrefreplace'
            # keeps unencodable characters printable on any console charset.
            if self.params.get('forcetitle', False):
                print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forceurl', False):
                print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
                print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forcedescription', False) and 'description' in info_dict:
                print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forcefilename', False) and filename is not None:
                print filename.encode(preferredencoding(), 'xmlcharrefreplace')

        if self.params.get('nooverwrites', False) and os.path.exists(filename):
            self.to_stderr(u'WARNING: file exists and will be skipped')

            self.pmkdir(filename)
        except (OSError, IOError), err:
            self.trouble(u'ERROR: unable to create directories: %s' % str(err))

            success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
        except (OSError, IOError), err:
            raise UnavailableVideoError
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self.trouble(u'ERROR: unable to download video data: %s' % str(err))
        except (ContentTooShortError, ), err:
            self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))

            self.post_process(filename, info_dict)
        except (PostProcessingError), err:
            self.trouble(u'ERROR: postprocessing: %s' % str(err))
    def download(self, url_list):
        """Download a given list of URLs."""
        # A fixed template with several URLs would overwrite one file.
        if len(url_list) > 1 and self.fixed_template():
            raise SameFileError(self.params['outtmpl'])

            suitable_found = False
                # Go to next InfoExtractor if not suitable
                if not ie.suitable(url):

                # Suitable InfoExtractor found
                suitable_found = True

                # Extract information from URL and process it

                # Suitable InfoExtractor had been found; go to next URL
            if not suitable_found:
                self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)

        return self._download_retcode
    def post_process(self, filename, ie_info):
        """Run the postprocessing chain on the given file."""
        # 'info' is presumably a copy of ie_info built on an elided line;
        # the final file path is added for the postprocessors.
        info['filepath'] = filename
    def _download_with_rtmpdump(self, filename, url, player_url):
        self.report_destination(filename)
        tmpfilename = self.temp_name(filename)

        # Check for rtmpdump first
            subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
        except (OSError, IOError):
            self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')

        # Download using rtmpdump. rtmpdump returns exit code 2 when
        # the connection was interrupted and resuming appears to be
        # possible. This is part of rtmpdump's normal usage, AFAIK.
        basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
        retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
        while retval == 2 or retval == 1:
            prevsize = os.path.getsize(tmpfilename)
            self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
            time.sleep(5.0) # This seems to be needed
            # '-e' resumes; '-k 1' works around servers that refuse resume.
            retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
            cursize = os.path.getsize(tmpfilename)
            if prevsize == cursize and retval == 1:
        self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
        self.try_rename(tmpfilename, filename)
        self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
    def _do_download(self, filename, url, player_url):
        # Check file already present
        if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
            self.report_file_already_downloaded(filename)

        # Attempt to download using rtmpdump
        if url.startswith('rtmp'):
            return self._download_with_rtmpdump(filename, url, player_url)

        tmpfilename = self.temp_name(filename)

        # Do not include the Accept-Encoding header
        headers = {'Youtubedl-no-compression': 'True'}
        basic_request = urllib2.Request(url, None, headers)
        request = urllib2.Request(url, None, headers)

        # Establish possible resume length
        if os.path.isfile(tmpfilename):
            resume_len = os.path.getsize(tmpfilename)

        # Request parameters in case of being able to resume
        if self.params.get('continuedl', False) and resume_len != 0:
            self.report_resuming_byte(resume_len)
            request.add_header('Range','bytes=%d-' % resume_len)

        retries = self.params.get('retries', 0)
        while count <= retries:
            # Establish connection
                data = urllib2.urlopen(request)
            except (urllib2.HTTPError, ), err:
                if (err.code < 500 or err.code >= 600) and err.code != 416:
                    # Unexpected HTTP error
                elif err.code == 416:
                    # Unable to resume (requested range not satisfiable)
                        # Open the connection again without the range header
                        data = urllib2.urlopen(basic_request)
                        content_length = data.info()['Content-Length']
                    except (urllib2.HTTPError, ), err:
                        if err.code < 500 or err.code >= 600:
                    # Examine the reported length
                    if (content_length is not None and
                            (resume_len - 100 < long(content_length) < resume_len + 100)):
                        # The file had already been fully downloaded.
                        # Explanation to the above condition: in issue #175 it was revealed that
                        # YouTube sometimes adds or removes a few bytes from the end of the file,
                        # changing the file size slightly and causing problems for some users. So
                        # I decided to implement a suggested change and consider the file
                        # completely downloaded if the file size differs less than 100 bytes from
                        # the one in the hard drive.
                        self.report_file_already_downloaded(filename)
                        self.try_rename(tmpfilename, filename)
                        # The length does not match, we start the download over
                        self.report_unable_to_resume()
            self.report_retry(count, retries)

            self.trouble(u'ERROR: giving up after %s retries' % retries)

        data_len = data.info().get('Content-length', None)
        if data_len is not None:
            data_len = long(data_len) + resume_len  # total size includes the bytes already on disk
        data_len_str = self.format_bytes(data_len)
        byte_counter = 0 + resume_len
            data_block = data.read(block_size)
            if len(data_block) == 0:
            byte_counter += len(data_block)

            # Open file just in time
                (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
                filename = self.undo_temp_name(tmpfilename)
                self.report_destination(filename)
            except (OSError, IOError), err:
                self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
                stream.write(data_block)
            except (IOError, OSError), err:
                self.trouble(u'\nERROR: unable to write data: %s' % str(err))
            # Tune the next read size to the measured throughput.
            block_size = self.best_block_size(after - before, len(data_block))

            percent_str = self.calc_percent(byte_counter, data_len)
            eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
            speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
            self.report_progress(percent_str, data_len_str, speed_str, eta_str)

            self.slow_down(start, byte_counter - resume_len)

        if data_len is not None and byte_counter != data_len:
            raise ContentTooShortError(byte_counter, long(data_len))
        self.try_rename(tmpfilename, filename)

        # Update file modification time
        if self.params.get('updatetime', True):
            self.try_utime(filename, data.info().get('last-modified', None))
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. The information is stored in a dictionary
    which is then passed to the FileDownloader. The FileDownloader
    processes this information possibly downloading the video to the file
    system, among other possible outcomes. The dictionaries must include
    the following fields:

    id: Video identifier.
    url: Final video URL.
    uploader: Nickname of the video uploader.
    title: Literal title.
    stitle: Simplified title.
    ext: Video filename extension.
    format: Video format.
    player_url: SWF Player URL (may be None).

    The following fields are optional. Their primary purpose is to allow
    youtube-dl to serve as the backend for a video search function, such
    as the one in youtube2mp3. They are only used when their respective
    forced printing functions are called:

    thumbnail: Full URL to a video thumbnail image.
    description: One-line video description.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods, as well as the suitable() static method.
    Probably, they should also be instantiated and added to the main

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self.set_downloader(downloader)

        """Receives a URL and returns True if suitable for this IE."""

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        # Mutual registration counterpart of FileDownloader.add_info_extractor.
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Matches youtu.be short links, watch pages, /v/, /embed/ and /e/ URLs
    # and youtube-nocookie.com; group 2 captures the video id.
    _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'  # forces the English site
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        return (re.match(YoutubeIE._VALID_URL, url) is not None)
889 def report_lang(self):
890 """Report attempt to set language."""
891 self._downloader.to_screen(u'[youtube] Setting language')
893 def report_login(self):
894 """Report attempt to log in."""
895 self._downloader.to_screen(u'[youtube] Logging in')
897 def report_age_confirmation(self):
898 """Report attempt to confirm age."""
899 self._downloader.to_screen(u'[youtube] Confirming age')
901 def report_video_webpage_download(self, video_id):
902 """Report attempt to download video webpage."""
903 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
905 def report_video_info_webpage_download(self, video_id):
906 """Report attempt to download video info webpage."""
907 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
909 def report_information_extraction(self, video_id):
910 """Report attempt to extract video information."""
911 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available for this video."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
917 def report_rtmp_download(self):
918 """Indicate the download will use the RTMP protocol."""
919 self._downloader.to_screen(u'[youtube] RTMP download detected')
    def _real_initialize(self):
        # Set the language cookie and optionally log in / confirm age.
        # Auth failures here only emit warnings; extraction may still work.
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

        request = urllib2.Request(self._LANG_URL)
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))

        # No authentication to be performed

            'current_form': 'loginForm',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
            login_results = urllib2.urlopen(request).read()
            # If the login form is still present, the credentials were rejected.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

            'action_confirm': 'Confirm',
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
            self.report_age_confirmation()
            age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
    def _real_extract(self, url):
        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(2)

        self.report_video_webpage_download(video_id)
        # has_verified=1 reduces interstitial pages for flagged videos.
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))  # unescape the JS-escaped URL

        self.report_video_info_webpage_download(video_id)
        # Try several 'el' variants; some work for embeddable/vevo videos
        # where others are rejected.
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url)
                video_info_webpage = urllib2.urlopen(request).read()
                video_info = parse_qs(video_info_webpage)
                if 'token' in video_info:
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))

        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Start extracting information
        self.report_information_extraction(video_id)

        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')
        video_title = sanitize_title(video_title)

        # Simplified title: only the characters in simple_title_chars remain.
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
        simple_title = simple_title.strip(ur'_')

        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else: # don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # Upload date: scraped from the watch page and normalized to YYYYMMDD.
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        video_description = 'No description available.'
        if self._downloader.params.get('forcedescription', False):
            mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
            if mobj is not None:
                video_description = mobj.group(1)

        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [dict(pairStr.split('=') for pairStr in uds.split('&')) for uds in url_data_strs]
            url_map = dict((ud['itag'], urllib.unquote(ud['url'])) for ud in url_data)  # itag -> direct URL
            format_limit = self._downloader.params.get('format_limit', None)
            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
1101 if req_format is None:
1102 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1103 elif req_format == '-1':
1104 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1107 if req_format not in url_map:
1108 self._downloader.trouble(u'ERROR: requested format not available')
1110 video_url_list = [(req_format, url_map[req_format])] # Specific format
1112 elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1113 self.report_rtmp_download()
1114 video_url_list = [(None, video_info['conn'][0])]
1117 self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
1120 for format_param, video_real_url in video_url_list:
1121 # At this point we have a new video
1122 self._downloader.increment_downloads()
1125 video_extension = self._video_extensions.get(format_param, 'flv')
1127 # Find the video URL in fmt_url_map or conn paramters
1129 # Process video information
1130 self._downloader.process_info({
1131 'id': video_id.decode('utf-8'),
1132 'url': video_real_url.decode('utf-8'),
1133 'uploader': video_uploader.decode('utf-8'),
1134 'upload_date': upload_date,
1135 'title': video_title,
1136 'stitle': simple_title,
1137 'ext': video_extension.decode('utf-8'),
1138 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1139 'thumbnail': video_thumbnail.decode('utf-8'),
1140 'description': video_description.decode('utf-8'),
1141 'player_url': player_url,
1143 except UnavailableVideoError, err:
1144 self._downloader.trouble(u'\nERROR: unable to download video')
1147 class MetacafeIE(InfoExtractor):
1148 """Information Extractor for metacafe.com."""
1150 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1151 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1152 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
def __init__(self, youtube_ie, downloader=None):
    # Delegate shared setup to the InfoExtractor base class, then keep a
    # reference to the YoutubeIE instance: _real_extract hands Metacafe
    # "yt-" prefixed video ids over to it.
    InfoExtractor.__init__(self, downloader)
    self._youtube_ie = youtube_ie
1161 return (re.match(MetacafeIE._VALID_URL, url) is not None)
def report_disclaimer(self):
    """Announce that the Metacafe disclaimer page is being retrieved."""
    message = u'[metacafe] Retrieving disclaimer'
    self._downloader.to_screen(message)
def report_age_confirmation(self):
    """Announce the attempt to confirm the user's age with Metacafe."""
    notice = u'[metacafe] Confirming age'
    self._downloader.to_screen(notice)
def report_download_webpage(self, video_id):
    """Announce that the Metacafe page for video_id is being downloaded."""
    status = u'[metacafe] %s: Downloading webpage' % video_id
    self._downloader.to_screen(status)
def report_extraction(self, video_id):
    """Announce that information is being extracted for video_id."""
    status = u'[metacafe] %s: Extracting information' % video_id
    self._downloader.to_screen(status)
1179 def _real_initialize(self):
1180 # Retrieve disclaimer
1181 request = urllib2.Request(self._DISCLAIMER)
1183 self.report_disclaimer()
1184 disclaimer = urllib2.urlopen(request).read()
1185 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1186 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1192 'submit': "Continue - I'm over 18",
1194 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1196 self.report_age_confirmation()
1197 disclaimer = urllib2.urlopen(request).read()
1198 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1199 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1202 def _real_extract(self, url):
1203 # Extract id and simplified title from URL
1204 mobj = re.match(self._VALID_URL, url)
1206 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1209 video_id = mobj.group(1)
1211 # Check if video comes from YouTube
1212 mobj2 = re.match(r'^yt-(.*)$', video_id)
1213 if mobj2 is not None:
1214 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1217 # At this point we have a new video
1218 self._downloader.increment_downloads()
1220 simple_title = mobj.group(2).decode('utf-8')
1222 # Retrieve video webpage to extract further information
1223 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1225 self.report_download_webpage(video_id)
1226 webpage = urllib2.urlopen(request).read()
1227 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1228 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1231 # Extract URL, uploader and title from webpage
1232 self.report_extraction(video_id)
1233 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1234 if mobj is not None:
1235 mediaURL = urllib.unquote(mobj.group(1))
1236 video_extension = mediaURL[-3:]
1238 # Extract gdaKey if available
1239 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1241 video_url = mediaURL
1243 gdaKey = mobj.group(1)
1244 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1246 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1248 self._downloader.trouble(u'ERROR: unable to extract media URL')
1250 vardict = parse_qs(mobj.group(1))
1251 if 'mediaData' not in vardict:
1252 self._downloader.trouble(u'ERROR: unable to extract media URL')
1254 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1256 self._downloader.trouble(u'ERROR: unable to extract media URL')
1258 mediaURL = mobj.group(1).replace('\\/', '/')
1259 video_extension = mediaURL[-3:]
1260 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1262 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1264 self._downloader.trouble(u'ERROR: unable to extract title')
1266 video_title = mobj.group(1).decode('utf-8')
1267 video_title = sanitize_title(video_title)
1269 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1271 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1273 video_uploader = mobj.group(1)
1276 # Process video information
1277 self._downloader.process_info({
1278 'id': video_id.decode('utf-8'),
1279 'url': video_url.decode('utf-8'),
1280 'uploader': video_uploader.decode('utf-8'),
1281 'upload_date': u'NA',
1282 'title': video_title,
1283 'stitle': simple_title,
1284 'ext': video_extension.decode('utf-8'),
1288 except UnavailableVideoError:
1289 self._downloader.trouble(u'\nERROR: unable to download video')
1292 class DailymotionIE(InfoExtractor):
1293 """Information Extractor for Dailymotion"""
1295 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
def __init__(self, downloader=None):
    # No extractor-specific state; all setup lives in InfoExtractor.
    InfoExtractor.__init__(self, downloader)
1302 return (re.match(DailymotionIE._VALID_URL, url) is not None)
def report_download_webpage(self, video_id):
    """Announce that the Dailymotion page for video_id is being downloaded."""
    status = u'[dailymotion] %s: Downloading webpage' % video_id
    self._downloader.to_screen(status)
def report_extraction(self, video_id):
    """Announce that information is being extracted for video_id."""
    status = u'[dailymotion] %s: Extracting information' % video_id
    self._downloader.to_screen(status)
1312 def _real_initialize(self):
1315 def _real_extract(self, url):
1316 # Extract id and simplified title from URL
1317 mobj = re.match(self._VALID_URL, url)
1319 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1322 # At this point we have a new video
1323 self._downloader.increment_downloads()
1324 video_id = mobj.group(1)
1326 simple_title = mobj.group(2).decode('utf-8')
1327 video_extension = 'flv'
1329 # Retrieve video webpage to extract further information
1330 request = urllib2.Request(url)
1332 self.report_download_webpage(video_id)
1333 webpage = urllib2.urlopen(request).read()
1334 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1335 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1338 # Extract URL, uploader and title from webpage
1339 self.report_extraction(video_id)
1340 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1342 self._downloader.trouble(u'ERROR: unable to extract media URL')
1344 mediaURL = urllib.unquote(mobj.group(1))
1346 # if needed add http://www.dailymotion.com/ if relative URL
1348 video_url = mediaURL
1350 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1351 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1353 self._downloader.trouble(u'ERROR: unable to extract title')
1355 video_title = mobj.group(1).decode('utf-8')
1356 video_title = sanitize_title(video_title)
1358 mobj = re.search(r'(?im)<Attribute name="owner">(.+?)</Attribute>', webpage)
1360 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1362 video_uploader = mobj.group(1)
1365 # Process video information
1366 self._downloader.process_info({
1367 'id': video_id.decode('utf-8'),
1368 'url': video_url.decode('utf-8'),
1369 'uploader': video_uploader.decode('utf-8'),
1370 'upload_date': u'NA',
1371 'title': video_title,
1372 'stitle': simple_title,
1373 'ext': video_extension.decode('utf-8'),
1377 except UnavailableVideoError:
1378 self._downloader.trouble(u'\nERROR: unable to download video')
1380 class GoogleIE(InfoExtractor):
1381 """Information extractor for video.google.com."""
1383 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
def __init__(self, downloader=None):
    # No extractor-specific state; all setup lives in InfoExtractor.
    InfoExtractor.__init__(self, downloader)
1390 return (re.match(GoogleIE._VALID_URL, url) is not None)
def report_download_webpage(self, video_id):
    """Announce that the Google Video page for video_id is being downloaded."""
    status = u'[video.google] %s: Downloading webpage' % video_id
    self._downloader.to_screen(status)
def report_extraction(self, video_id):
    """Announce that information is being extracted for video_id."""
    status = u'[video.google] %s: Extracting information' % video_id
    self._downloader.to_screen(status)
1400 def _real_initialize(self):
1403 def _real_extract(self, url):
1404 # Extract id from URL
1405 mobj = re.match(self._VALID_URL, url)
1407 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1410 # At this point we have a new video
1411 self._downloader.increment_downloads()
1412 video_id = mobj.group(1)
1414 video_extension = 'mp4'
1416 # Retrieve video webpage to extract further information
1417 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1419 self.report_download_webpage(video_id)
1420 webpage = urllib2.urlopen(request).read()
1421 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1422 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1425 # Extract URL, uploader, and title from webpage
1426 self.report_extraction(video_id)
1427 mobj = re.search(r"download_url:'([^']+)'", webpage)
1429 video_extension = 'flv'
1430 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1432 self._downloader.trouble(u'ERROR: unable to extract media URL')
1434 mediaURL = urllib.unquote(mobj.group(1))
1435 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1436 mediaURL = mediaURL.replace('\\x26', '\x26')
1438 video_url = mediaURL
1440 mobj = re.search(r'<title>(.*)</title>', webpage)
1442 self._downloader.trouble(u'ERROR: unable to extract title')
1444 video_title = mobj.group(1).decode('utf-8')
1445 video_title = sanitize_title(video_title)
1446 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1448 # Extract video description
1449 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1451 self._downloader.trouble(u'ERROR: unable to extract video description')
1453 video_description = mobj.group(1).decode('utf-8')
1454 if not video_description:
1455 video_description = 'No description available.'
1457 # Extract video thumbnail
1458 if self._downloader.params.get('forcethumbnail', False):
1459 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1461 webpage = urllib2.urlopen(request).read()
1462 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1463 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1465 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1467 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1469 video_thumbnail = mobj.group(1)
1470 else: # we need something to pass to process_info
1471 video_thumbnail = ''
1475 # Process video information
1476 self._downloader.process_info({
1477 'id': video_id.decode('utf-8'),
1478 'url': video_url.decode('utf-8'),
1480 'upload_date': u'NA',
1481 'title': video_title,
1482 'stitle': simple_title,
1483 'ext': video_extension.decode('utf-8'),
1487 except UnavailableVideoError:
1488 self._downloader.trouble(u'\nERROR: unable to download video')
1491 class PhotobucketIE(InfoExtractor):
1492 """Information extractor for photobucket.com."""
1494 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
def __init__(self, downloader=None):
    # No extractor-specific state; all setup lives in InfoExtractor.
    InfoExtractor.__init__(self, downloader)
1501 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
def report_download_webpage(self, video_id):
    """Announce that the Photobucket page for video_id is being downloaded."""
    status = u'[photobucket] %s: Downloading webpage' % video_id
    self._downloader.to_screen(status)
def report_extraction(self, video_id):
    """Announce that information is being extracted for video_id."""
    status = u'[photobucket] %s: Extracting information' % video_id
    self._downloader.to_screen(status)
1511 def _real_initialize(self):
1514 def _real_extract(self, url):
1515 # Extract id from URL
1516 mobj = re.match(self._VALID_URL, url)
1518 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1521 # At this point we have a new video
1522 self._downloader.increment_downloads()
1523 video_id = mobj.group(1)
1525 video_extension = 'flv'
1527 # Retrieve video webpage to extract further information
1528 request = urllib2.Request(url)
1530 self.report_download_webpage(video_id)
1531 webpage = urllib2.urlopen(request).read()
1532 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1533 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1536 # Extract URL, uploader, and title from webpage
1537 self.report_extraction(video_id)
1538 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1540 self._downloader.trouble(u'ERROR: unable to extract media URL')
1542 mediaURL = urllib.unquote(mobj.group(1))
1544 video_url = mediaURL
1546 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1548 self._downloader.trouble(u'ERROR: unable to extract title')
1550 video_title = mobj.group(1).decode('utf-8')
1551 video_title = sanitize_title(video_title)
1552 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1554 video_uploader = mobj.group(2).decode('utf-8')
1557 # Process video information
1558 self._downloader.process_info({
1559 'id': video_id.decode('utf-8'),
1560 'url': video_url.decode('utf-8'),
1561 'uploader': video_uploader,
1562 'upload_date': u'NA',
1563 'title': video_title,
1564 'stitle': simple_title,
1565 'ext': video_extension.decode('utf-8'),
1569 except UnavailableVideoError:
1570 self._downloader.trouble(u'\nERROR: unable to download video')
1573 class YahooIE(InfoExtractor):
1574 """Information extractor for video.yahoo.com."""
1576 # _VALID_URL matches all Yahoo! Video URLs
1577 # _VPAGE_URL matches only the extractable '/watch/' URLs
1578 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1579 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
def __init__(self, downloader=None):
    # No extractor-specific state; all setup lives in InfoExtractor.
    InfoExtractor.__init__(self, downloader)
1586 return (re.match(YahooIE._VALID_URL, url) is not None)
def report_download_webpage(self, video_id):
    """Announce that the Yahoo! Video page for video_id is being downloaded."""
    status = u'[video.yahoo] %s: Downloading webpage' % video_id
    self._downloader.to_screen(status)
def report_extraction(self, video_id):
    """Announce that information is being extracted for video_id."""
    status = u'[video.yahoo] %s: Extracting information' % video_id
    self._downloader.to_screen(status)
1596 def _real_initialize(self):
1599 def _real_extract(self, url, new_video=True):
1600 # Extract ID from URL
1601 mobj = re.match(self._VALID_URL, url)
1603 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1606 # At this point we have a new video
1607 self._downloader.increment_downloads()
1608 video_id = mobj.group(2)
1609 video_extension = 'flv'
1611 # Rewrite valid but non-extractable URLs as
1612 # extractable English language /watch/ URLs
1613 if re.match(self._VPAGE_URL, url) is None:
1614 request = urllib2.Request(url)
1616 webpage = urllib2.urlopen(request).read()
1617 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1618 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1621 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1623 self._downloader.trouble(u'ERROR: Unable to extract id field')
1625 yahoo_id = mobj.group(1)
1627 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1629 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1631 yahoo_vid = mobj.group(1)
1633 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1634 return self._real_extract(url, new_video=False)
1636 # Retrieve video webpage to extract further information
1637 request = urllib2.Request(url)
1639 self.report_download_webpage(video_id)
1640 webpage = urllib2.urlopen(request).read()
1641 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1642 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1645 # Extract uploader and title from webpage
1646 self.report_extraction(video_id)
1647 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1649 self._downloader.trouble(u'ERROR: unable to extract video title')
1651 video_title = mobj.group(1).decode('utf-8')
1652 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1654 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1656 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1658 video_uploader = mobj.group(1).decode('utf-8')
1660 # Extract video thumbnail
1661 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1663 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1665 video_thumbnail = mobj.group(1).decode('utf-8')
1667 # Extract video description
1668 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1670 self._downloader.trouble(u'ERROR: unable to extract video description')
1672 video_description = mobj.group(1).decode('utf-8')
1673 if not video_description: video_description = 'No description available.'
1675 # Extract video height and width
1676 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1678 self._downloader.trouble(u'ERROR: unable to extract video height')
1680 yv_video_height = mobj.group(1)
1682 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1684 self._downloader.trouble(u'ERROR: unable to extract video width')
1686 yv_video_width = mobj.group(1)
1688 # Retrieve video playlist to extract media URL
1689 # I'm not completely sure what all these options are, but we
1690 # seem to need most of them, otherwise the server sends a 401.
1691 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1692 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1693 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1694 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1695 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1697 self.report_download_webpage(video_id)
1698 webpage = urllib2.urlopen(request).read()
1699 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1700 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1703 # Extract media URL from playlist XML
1704 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1706 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1708 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1709 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1712 # Process video information
1713 self._downloader.process_info({
1714 'id': video_id.decode('utf-8'),
1716 'uploader': video_uploader,
1717 'upload_date': u'NA',
1718 'title': video_title,
1719 'stitle': simple_title,
1720 'ext': video_extension.decode('utf-8'),
1721 'thumbnail': video_thumbnail.decode('utf-8'),
1722 'description': video_description,
1723 'thumbnail': video_thumbnail,
1724 'description': video_description,
1727 except UnavailableVideoError:
1728 self._downloader.trouble(u'\nERROR: unable to download video')
1731 class GenericIE(InfoExtractor):
1732 """Generic last-resort information extractor."""
def __init__(self, downloader=None):
    # No extractor-specific state; all setup lives in InfoExtractor.
    InfoExtractor.__init__(self, downloader)
def report_download_webpage(self, video_id):
    """Warn that the generic fallback is in use, then announce the download."""
    screen = self._downloader.to_screen
    screen(u'WARNING: Falling back on generic information extractor.')
    screen(u'[generic] %s: Downloading webpage' % video_id)
def report_extraction(self, video_id):
    """Announce that information is being extracted for video_id."""
    status = u'[generic] %s: Extracting information' % video_id
    self._downloader.to_screen(status)
1750 def _real_initialize(self):
1753 def _real_extract(self, url):
1754 # At this point we have a new video
1755 self._downloader.increment_downloads()
1757 video_id = url.split('/')[-1]
1758 request = urllib2.Request(url)
1760 self.report_download_webpage(video_id)
1761 webpage = urllib2.urlopen(request).read()
1762 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1763 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1765 except ValueError, err:
1766 # since this is the last-resort InfoExtractor, if
1767 # this error is thrown, it'll be thrown here
1768 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1771 self.report_extraction(video_id)
1772 # Start with something easy: JW Player in SWFObject
1773 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1775 # Broaden the search a little bit
1776 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1778 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1781 # It's possible that one of the regexes
1782 # matched, but returned an empty group:
1783 if mobj.group(1) is None:
1784 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1787 video_url = urllib.unquote(mobj.group(1))
1788 video_id = os.path.basename(video_url)
1790 # here's a fun little line of code for you:
1791 video_extension = os.path.splitext(video_id)[1][1:]
1792 video_id = os.path.splitext(video_id)[0]
1794 # it's tempting to parse this further, but you would
1795 # have to take into account all the variations like
1796 # Video Title - Site Name
1797 # Site Name | Video Title
1798 # Video Title - Tagline | Site Name
1799 # and so on and so forth; it's just not practical
1800 mobj = re.search(r'<title>(.*)</title>', webpage)
1802 self._downloader.trouble(u'ERROR: unable to extract title')
1804 video_title = mobj.group(1).decode('utf-8')
1805 video_title = sanitize_title(video_title)
1806 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1808 # video uploader is domain name
1809 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1811 self._downloader.trouble(u'ERROR: unable to extract title')
1813 video_uploader = mobj.group(1).decode('utf-8')
1816 # Process video information
1817 self._downloader.process_info({
1818 'id': video_id.decode('utf-8'),
1819 'url': video_url.decode('utf-8'),
1820 'uploader': video_uploader,
1821 'upload_date': u'NA',
1822 'title': video_title,
1823 'stitle': simple_title,
1824 'ext': video_extension.decode('utf-8'),
1828 except UnavailableVideoError, err:
1829 self._downloader.trouble(u'\nERROR: unable to download video')
1832 class YoutubeSearchIE(InfoExtractor):
1833 """Information Extractor for YouTube search queries."""
1834 _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1835 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1836 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1837 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1839 _max_youtube_results = 1000
def __init__(self, youtube_ie, downloader=None):
    # Delegate shared setup to InfoExtractor and keep the wrapped YoutubeIE;
    # search results are passed to it one watch-URL at a time.
    InfoExtractor.__init__(self, downloader)
    self._youtube_ie = youtube_ie
1847 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
def report_download_page(self, query, pagenum):
    """Report attempt to download search results page with given number."""
    decoded_query = query.decode(preferredencoding())
    status = u'[youtube] query "%s": Downloading page %s' % (decoded_query, pagenum)
    self._downloader.to_screen(status)
def _real_initialize(self):
    # Prime the wrapped YoutubeIE before any results are handed to it.
    self._youtube_ie.initialize()
1857 def _real_extract(self, query):
1858 mobj = re.match(self._VALID_QUERY, query)
1860 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1863 prefix, query = query.split(':')
1865 query = query.encode('utf-8')
1867 self._download_n_results(query, 1)
1869 elif prefix == 'all':
1870 self._download_n_results(query, self._max_youtube_results)
1876 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1878 elif n > self._max_youtube_results:
1879 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1880 n = self._max_youtube_results
1881 self._download_n_results(query, n)
1883 except ValueError: # parsing prefix as integer fails
1884 self._download_n_results(query, 1)
1887 def _download_n_results(self, query, n):
1888 """Downloads a specified number of results for a query"""
1891 already_seen = set()
1895 self.report_download_page(query, pagenum)
1896 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1897 request = urllib2.Request(result_url)
1899 page = urllib2.urlopen(request).read()
1900 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1901 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1904 # Extract video identifiers
1905 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1906 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1907 if video_id not in already_seen:
1908 video_ids.append(video_id)
1909 already_seen.add(video_id)
1910 if len(video_ids) == n:
1911 # Specified n videos reached
1912 for id in video_ids:
1913 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1916 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1917 for id in video_ids:
1918 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1921 pagenum = pagenum + 1
1923 class GoogleSearchIE(InfoExtractor):
1924 """Information Extractor for Google Video search queries."""
1925 _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1926 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1927 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1928 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1930 _max_google_results = 1000
def __init__(self, google_ie, downloader=None):
    # Delegate shared setup to InfoExtractor and keep the wrapped GoogleIE;
    # search results are passed to it one videoplay-URL at a time.
    InfoExtractor.__init__(self, downloader)
    self._google_ie = google_ie
1938 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
def report_download_page(self, query, pagenum):
    """Report attempt to download search results page with given number."""
    decoded_query = query.decode(preferredencoding())
    status = u'[video.google] query "%s": Downloading page %s' % (decoded_query, pagenum)
    self._downloader.to_screen(status)
def _real_initialize(self):
    # Prime the wrapped GoogleIE before any results are handed to it.
    self._google_ie.initialize()
1948 def _real_extract(self, query):
1949 mobj = re.match(self._VALID_QUERY, query)
1951 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1954 prefix, query = query.split(':')
1956 query = query.encode('utf-8')
1958 self._download_n_results(query, 1)
1960 elif prefix == 'all':
1961 self._download_n_results(query, self._max_google_results)
1967 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1969 elif n > self._max_google_results:
1970 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1971 n = self._max_google_results
1972 self._download_n_results(query, n)
1974 except ValueError: # parsing prefix as integer fails
1975 self._download_n_results(query, 1)
1978 def _download_n_results(self, query, n):
1979 """Downloads a specified number of results for a query"""
1982 already_seen = set()
1986 self.report_download_page(query, pagenum)
1987 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1988 request = urllib2.Request(result_url)
1990 page = urllib2.urlopen(request).read()
1991 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1992 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1995 # Extract video identifiers
1996 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1997 video_id = mobj.group(1)
1998 if video_id not in already_seen:
1999 video_ids.append(video_id)
2000 already_seen.add(video_id)
2001 if len(video_ids) == n:
2002 # Specified n videos reached
2003 for id in video_ids:
2004 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2007 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2008 for id in video_ids:
2009 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2012 pagenum = pagenum + 1
# Search IE for "yvsearch[N|all]:query" queries against Yahoo! Video.
# Mirrors the Google search IE above; results are delegated to a wrapped
# YahooIE instance.
# NOTE(review): this chunk is missing interior lines throughout (guards,
# "try:"/"return" lines, loop headers, the suitable() def line at ~2028) --
# confirm against the full file before editing.
2014 class YahooSearchIE(InfoExtractor):
2015 """Information Extractor for Yahoo! Video search queries."""
2016 _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
2017 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2018 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2019 _MORE_PAGES_INDICATOR = r'\s*Next'
2021 _max_yahoo_results = 1000
2023 def __init__(self, yahoo_ie, downloader=None):
2024 InfoExtractor.__init__(self, downloader)
2025 self._yahoo_ie = yahoo_ie
# Body of suitable() (its def line is on a missing line above).
2029 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
2031 def report_download_page(self, query, pagenum):
2032 """Report attempt to download playlist page with given number."""
2033 query = query.decode(preferredencoding())
2034 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2036 def _real_initialize(self):
2037 self._yahoo_ie.initialize()
# Parse the "yvsearch..." prefix and dispatch (same shape as gvsearch).
2039 def _real_extract(self, query):
2040 mobj = re.match(self._VALID_QUERY, query)
2042 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2045 prefix, query = query.split(':')
2047 query = query.encode('utf-8')
2049 self._download_n_results(query, 1)
2051 elif prefix == 'all':
2052 self._download_n_results(query, self._max_yahoo_results)
2058 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2060 elif n > self._max_yahoo_results:
2061 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2062 n = self._max_yahoo_results
2063 self._download_n_results(query, n)
2065 except ValueError: # parsing prefix as integer fails
2066 self._download_n_results(query, 1)
2069 def _download_n_results(self, query, n):
2070 """Downloads a specified number of results for a query"""
2073 already_seen = set()
2077 self.report_download_page(query, pagenum)
2078 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2079 request = urllib2.Request(result_url)
2081 page = urllib2.urlopen(request).read()
2082 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2083 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2086 # Extract video identifiers
2087 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2088 video_id = mobj.group(1)
2089 if video_id not in already_seen:
2090 video_ids.append(video_id)
2091 already_seen.add(video_id)
2092 if len(video_ids) == n:
2093 # Specified n videos reached
2094 for id in video_ids:
2095 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2098 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2099 for id in video_ids:
2100 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2103 pagenum = pagenum + 1
# IE for YouTube playlist/artist/user-grid URLs: walks the paginated
# playlist pages, collects watch?v= ids, applies playliststart/playlistend
# slicing, and delegates each video to the wrapped YoutubeIE.
# NOTE(review): many interior lines are missing from this chunk (guards,
# "return" lines, "else:" branch, loop headers, "try:", the
# "ids_in_page = []" / "video_ids = []" / "pagenum" initializers, "break") --
# confirm against the full file before editing.
2105 class YoutubePlaylistIE(InfoExtractor):
2106 """Information Extractor for YouTube playlists."""
2108 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist)\?.*?(p|a)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2109 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2110 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2111 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2114 def __init__(self, youtube_ie, downloader=None):
2115 InfoExtractor.__init__(self, downloader)
2116 self._youtube_ie = youtube_ie
# Body of suitable() (its def line is on a missing line above).
2120 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
2122 def report_download_page(self, playlist_id, pagenum):
2123 """Report attempt to download playlist page with given number."""
2124 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2126 def _real_initialize(self):
2127 self._youtube_ie.initialize()
2129 def _real_extract(self, url):
2130 # Extract playlist id
2131 mobj = re.match(self._VALID_URL, url)
2133 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# Group 3 is a single-video match inside a playlist URL; presumably
# delegated directly and followed by a return (lines missing).
2137 if mobj.group(3) is not None:
2138 self._youtube_ie.extract(mobj.group(3))
2141 # Download playlist pages
2142 # prefix is 'p' as default for playlists but there are other types that need extra care
2143 playlist_prefix = mobj.group(1)
2144 if playlist_prefix == 'a':
2145 playlist_access = 'artist'
2147 playlist_prefix = 'p'
2148 playlist_access = 'view_play_list'
2149 playlist_id = mobj.group(2)
2154 self.report_download_page(playlist_id, pagenum)
2155 request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2157 page = urllib2.urlopen(request).read()
2158 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2159 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2162 # Extract video identifiers
2164 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2165 if mobj.group(1) not in ids_in_page:
2166 ids_in_page.append(mobj.group(1))
2167 video_ids.extend(ids_in_page)
# Last page detected when the "Next" link is absent.
2169 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2171 pagenum = pagenum + 1
# Apply the user-requested playlist window (1-based start, inclusive end).
2173 playliststart = self._downloader.params.get('playliststart', 1) - 1
2174 playlistend = self._downloader.params.get('playlistend', -1)
2175 video_ids = video_ids[playliststart:playlistend]
2177 for id in video_ids:
2178 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# IE for a YouTube user's uploads: pages through the GData feed in
# _GDATA_PAGE_SIZE chunks, collects watch?v= ids, applies the
# playliststart/playlistend window, then delegates to the wrapped YoutubeIE.
# NOTE(review): interior lines are missing from this chunk (suitable() def,
# "if mobj is None:" guard, "return"s, loop header, "try:",
# "ids_in_page = []" / "video_ids = []" / pagenum initializers, "break",
# "else:") -- confirm against the full file before editing.
2181 class YoutubeUserIE(InfoExtractor):
2182 """Information Extractor for YouTube users."""
2184 _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2185 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2186 _GDATA_PAGE_SIZE = 50
2187 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2188 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2191 def __init__(self, youtube_ie, downloader=None):
2192 InfoExtractor.__init__(self, downloader)
2193 self._youtube_ie = youtube_ie
# Body of suitable() (its def line is on a missing line above).
2197 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2199 def report_download_page(self, username, start_index):
2200 """Report attempt to download user page."""
2201 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2202 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2204 def _real_initialize(self):
2205 self._youtube_ie.initialize()
2207 def _real_extract(self, url):
2209 mobj = re.match(self._VALID_URL, url)
2211 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2214 username = mobj.group(1)
2216 # Download video ids using YouTube Data API. Result size per
2217 # query is limited (currently to 50 videos) so we need to query
2218 # page by page until there are no video ids - it means we got
# GData start-index is 1-based.
2225 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2226 self.report_download_page(username, start_index)
2228 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2231 page = urllib2.urlopen(request).read()
2232 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2233 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2236 # Extract video identifiers
2239 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2240 if mobj.group(1) not in ids_in_page:
2241 ids_in_page.append(mobj.group(1))
2243 video_ids.extend(ids_in_page)
2245 # A little optimization - if current page is not
2246 # "full", ie. does not contain PAGE_SIZE video ids then
2247 # we can assume that this page is the last one - there
2248 # are no more ids on further pages - no need to query
2251 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2256 all_ids_count = len(video_ids)
2257 playliststart = self._downloader.params.get('playliststart', 1) - 1
2258 playlistend = self._downloader.params.get('playlistend', -1)
# playlistend == -1 means "to the end"; a -1 slice bound would drop the
# last element, hence the explicit branch.
2260 if playlistend == -1:
2261 video_ids = video_ids[playliststart:]
2263 video_ids = video_ids[playliststart:playlistend]
2265 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2266 (username, all_ids_count, len(video_ids)))
2268 for video_id in video_ids:
2269 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
# IE for depositfiles.com direct-file pages: POSTs the "Free download"
# form, scrapes the real fileshare URL and title, and hands the result to
# the downloader.
# NOTE(review): interior lines are missing from this chunk (suitable() def,
# "try:" lines, "return"s, "else:" branches, the "if mobj is None:" guard
# before the title error, _real_initialize's body) -- confirm against the
# full file before editing.
2272 class DepositFilesIE(InfoExtractor):
2273 """Information extractor for depositfiles.com"""
2275 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2277 def __init__(self, downloader=None):
2278 InfoExtractor.__init__(self, downloader)
# Body of suitable() (its def line is on a missing line above).
2282 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2284 def report_download_webpage(self, file_id):
2285 """Report webpage download."""
2286 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2288 def report_extraction(self, file_id):
2289 """Report information extraction."""
2290 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
# No initialization needed (body presumably "return" on a missing line).
2292 def _real_initialize(self):
2295 def _real_extract(self, url):
2296 # At this point we have a new file
2297 self._downloader.increment_downloads()
2299 file_id = url.split('/')[-1]
2300 # Rebuild url in english locale
2301 url = 'http://depositfiles.com/en/files/' + file_id
2303 # Retrieve file webpage with 'Free download' button pressed
2304 free_download_indication = { 'gateway_result' : '1' }
2305 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2307 self.report_download_webpage(file_id)
2308 webpage = urllib2.urlopen(request).read()
2309 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2310 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2313 # Search for the real file URL
2314 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2315 if (mobj is None) or (mobj.group(1) is None):
2316 # Try to figure out reason of the error.
2317 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2318 if (mobj is not None) and (mobj.group(1) is not None):
2319 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2320 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2322 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2325 file_url = mobj.group(1)
2326 file_extension = os.path.splitext(file_url)[1][1:]
2328 # Search for file title
2329 mobj = re.search(r'<b title="(.*?)">', webpage)
2331 self._downloader.trouble(u'ERROR: unable to extract title')
2333 file_title = mobj.group(1).decode('utf-8')
2336 # Process file information
2337 self._downloader.process_info({
2338 'id': file_id.decode('utf-8'),
2339 'url': file_url.decode('utf-8'),
2341 'upload_date': u'NA',
2342 'title': file_title,
2343 'stitle': file_title,
2344 'ext': file_extension.decode('utf-8'),
2348 except UnavailableVideoError, err:
2349 self._downloader.trouble(u'ERROR: unable to download file')
# IE for Facebook video pages: optionally logs in (username/password or
# .netrc), downloads the video page, scrapes metadata and per-format URLs
# from inline JavaScript, and hands each selected format to the downloader.
# NOTE(review): this chunk is decimated -- guards ("if mobj is None:"),
# "try:" lines, "return"s, "else:" branches, the login_form construction,
# the _video_extensions dict body, and several initializers are on missing
# lines.  Confirm against the full file before editing.
2351 class FacebookIE(InfoExtractor):
2352 """Information Extractor for Facebook"""
2354 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2355 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2356 _NETRC_MACHINE = 'facebook'
2357 _available_formats = ['highqual', 'lowqual']
# Maps format name -> file extension; dict body is on missing lines.
2358 _video_extensions = {
2363 def __init__(self, downloader=None):
2364 InfoExtractor.__init__(self, downloader)
# Body of suitable() (its def line is on a missing line above).
2368 return (re.match(FacebookIE._VALID_URL, url) is not None)
2370 def _reporter(self, message):
2371 """Add header and report message."""
2372 self._downloader.to_screen(u'[facebook] %s' % message)
2374 def report_login(self):
2375 """Report attempt to log in."""
2376 self._reporter(u'Logging in')
2378 def report_video_webpage_download(self, video_id):
2379 """Report attempt to download video webpage."""
2380 self._reporter(u'%s: Downloading video webpage' % video_id)
2382 def report_information_extraction(self, video_id):
2383 """Report attempt to extract video information."""
2384 self._reporter(u'%s: Extracting video information' % video_id)
2386 def _parse_page(self, video_webpage):
2387 """Extract video information from page"""
# Field name -> scraping regex; matched values are unicode-unescaped
# before being stored in video_info.
2389 data = {'title': r'class="video_title datawrap">(.*?)</',
2390 'description': r'<div class="datawrap">(.*?)</div>',
2391 'owner': r'\("video_owner_name", "(.*?)"\)',
2392 'upload_date': r'data-date="(.*?)"',
2393 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2396 for piece in data.keys():
2397 mobj = re.search(data[piece], video_webpage)
2398 if mobj is not None:
2399 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
# Collect one URL per known format name ("<fmt>_src" JS assignments).
2403 for fmt in self._available_formats:
2404 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2405 if mobj is not None:
2406 # URL is in a Javascript segment inside an escaped Unicode format within
2407 # the generally utf-8 page
2408 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2409 video_info['video_urls'] = video_urls
2413 def _real_initialize(self):
2414 if self._downloader is None:
2419 downloader_params = self._downloader.params
2421 # Attempt to use provided username and password or .netrc data
2422 if downloader_params.get('username', None) is not None:
2423 useremail = downloader_params['username']
2424 password = downloader_params['password']
2425 elif downloader_params.get('usenetrc', False):
2427 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2428 if info is not None:
2432 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2433 except (IOError, netrc.NetrcParseError), err:
2434 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
# No credentials => skip login entirely (return on a missing line).
2437 if useremail is None:
# login_form construction is on missing lines (2438-2445).
2446 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2449 login_results = urllib2.urlopen(request).read()
# A login form in the response means authentication failed.
2450 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2451 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2453 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2454 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2457 def _real_extract(self, url):
2458 mobj = re.match(self._VALID_URL, url)
2460 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2462 video_id = mobj.group('ID')
2465 self.report_video_webpage_download(video_id)
2466 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2468 page = urllib2.urlopen(request)
2469 video_webpage = page.read()
2470 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2471 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2474 # Start extracting information
2475 self.report_information_extraction(video_id)
2477 # Extract information
2478 video_info = self._parse_page(video_webpage)
2481 if 'owner' not in video_info:
2482 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2484 video_uploader = video_info['owner']
2487 if 'title' not in video_info:
2488 self._downloader.trouble(u'ERROR: unable to extract video title')
2490 video_title = video_info['title']
2491 video_title = video_title.decode('utf-8')
2492 video_title = sanitize_title(video_title)
# Filesystem-safe title: collapse runs of non-simple chars to "_".
2495 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2496 simple_title = simple_title.strip(ur'_')
2499 if 'thumbnail' not in video_info:
2500 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2501 video_thumbnail = ''
2503 video_thumbnail = video_info['thumbnail']
2507 if 'upload_date' in video_info:
2508 upload_time = video_info['upload_date']
2509 timetuple = email.utils.parsedate_tz(upload_time)
2510 if timetuple is not None:
# Normalize to YYYYMMDD; strftime failure is presumably swallowed by a
# try/except on missing lines.
2512 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2517 video_description = 'No description available.'
2518 if (self._downloader.params.get('forcedescription', False) and
2519 'description' in video_info):
2520 video_description = video_info['description']
2522 url_map = video_info['video_urls']
2523 if len(url_map.keys()) > 0:
2524 # Decide which formats to download
2525 req_format = self._downloader.params.get('format', None)
2526 format_limit = self._downloader.params.get('format_limit', None)
2528 if format_limit is not None and format_limit in self._available_formats:
2529 format_list = self._available_formats[self._available_formats.index(format_limit):]
2531 format_list = self._available_formats
2532 existing_formats = [x for x in format_list if x in url_map]
2533 if len(existing_formats) == 0:
2534 self._downloader.trouble(u'ERROR: no known formats available for video')
2536 if req_format is None:
2537 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2538 elif req_format == '-1':
2539 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2542 if req_format not in url_map:
2543 self._downloader.trouble(u'ERROR: requested format not available')
2545 video_url_list = [(req_format, url_map[req_format])] # Specific format
2547 for format_param, video_real_url in video_url_list:
2549 # At this point we have a new video
2550 self._downloader.increment_downloads()
2553 video_extension = self._video_extensions.get(format_param, 'mp4')
2555 # Find the video URL in fmt_url_map or conn paramters
2557 # Process video information
2558 self._downloader.process_info({
2559 'id': video_id.decode('utf-8'),
2560 'url': video_real_url.decode('utf-8'),
2561 'uploader': video_uploader.decode('utf-8'),
2562 'upload_date': upload_date,
2563 'title': video_title,
2564 'stitle': simple_title,
2565 'ext': video_extension.decode('utf-8'),
2566 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2567 'thumbnail': video_thumbnail.decode('utf-8'),
2568 'description': video_description.decode('utf-8'),
2571 except UnavailableVideoError, err:
2572 self._downloader.trouble(u'\nERROR: unable to download video')
class PostProcessor(object):
    """Base class for post-processing steps.

    PostProcessor objects can be added to downloaders with their
    add_post_processor() method. When the downloader has finished a
    successful download, it will take its internal chain of PostProcessors
    and start calling the run() method on each one of them, first with
    an initial argument and then with the returned value of the previous
    one.

    The chain will be stopped if one of them ever returns None or the end
    of the chain is reached.

    PostProcessor objects follow a "mutual registration" process similar
    to InfoExtractor objects.
    """

    def __init__(self, downloader=None):
        # The downloader may also be attached later via set_downloader().
        self._downloader = downloader

    def set_downloader(self, downloader):
        """Sets the downloader for this PP."""
        self._downloader = downloader

    def run(self, information):
        """Run the PostProcessor.

        The "information" argument is a dictionary like the ones
        composed by InfoExtractors, with an extra "filepath" field that
        points to the downloaded file.

        Returning None stops the postprocessing chain; returning an
        information dictionary passes it on to the next PostProcessor in
        the chain (possibly with some fields changed). This method may
        also raise a PostProcessingError exception, which the downloader
        takes into account.
        """
        # Base class behaviour: pass the information through unchanged.
        return information
# Post-processor that converts a downloaded video into an audio-only file
# using ffmpeg, probing the source codec with ffprobe first so aac/mp3
# streams can be copied losslessly when the preferred codec allows it.
# NOTE(review): this chunk is missing interior lines (probable @staticmethod
# decorators at 2628/2646, "return None"/"return ..." lines, "else:"
# branches, the "audio_codec = None" initializer, the os.remove() call near
# 2694, and the final "return information") -- confirm against the full
# file before editing.
2620 class FFmpegExtractAudioPP(PostProcessor):
2622 def __init__(self, downloader=None, preferredcodec=None):
2623 PostProcessor.__init__(self, downloader)
# 'best' means: keep the source codec when it is aac/mp3, else mp3.
2624 if preferredcodec is None:
2625 preferredcodec = 'best'
2626 self._preferredcodec = preferredcodec
# Probe the file with ffprobe and return the audio stream's codec name
# (presumably None on failure; those return lines are missing).
2629 def get_audio_codec(path):
2631 cmd = ['ffprobe', '-show_streams', '--', path]
2632 handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
2633 output = handle.communicate()[0]
2634 if handle.wait() != 0:
2636 except (IOError, OSError):
# Scan ffprobe's key=value output; codec_name precedes codec_type in
# each stream block.
2639 for line in output.split('\n'):
2640 if line.startswith('codec_name='):
2641 audio_codec = line.split('=')[1].strip()
2642 elif line.strip() == 'codec_type=audio' and audio_codec is not None:
# Transcode path -> out_path with the given codec and extra options.
2647 def run_ffmpeg(path, out_path, codec, more_opts):
2649 cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
2650 ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
2652 except (IOError, OSError):
2655 def run(self, information):
2656 path = information['filepath']
2658 filecodec = self.get_audio_codec(path)
2659 if filecodec is None:
2660 self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
2664 if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
2665 if filecodec == 'aac' or filecodec == 'mp3':
2666 # Lossless if possible
2668 extension = filecodec
# Raw AAC needs an ADTS container to be playable standalone.
2669 if filecodec == 'aac':
2670 more_opts = ['-f', 'adts']
2673 acodec = 'libmp3lame'
2675 more_opts = ['-ab', '128k']
2677 # We convert the audio (lossy)
2678 acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
2679 extension = self._preferredcodec
2680 more_opts = ['-ab', '128k']
2681 if self._preferredcodec == 'aac':
2682 more_opts += ['-f', 'adts']
2684 (prefix, ext) = os.path.splitext(path)
2685 new_path = prefix + '.' + extension
2686 self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
2687 status = self.run_ffmpeg(path, new_path, acodec, more_opts)
2690 self._downloader.to_stderr(u'WARNING: error running ffmpeg')
# Removal of the original video file presumably happens here (missing
# lines around 2694); failures are reported but not fatal.
2695 except (IOError, OSError):
2696 self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
2699 information['filepath'] = new_path
# Self-update: fetch the latest release tag from GitHub and overwrite this
# very script file with the corresponding version.
# NOTE(review): the "try:" lines pairing with both excepts and the
# stream.close() call are on missing lines in this chunk.
2703 def updateSelf(downloader, filename):
2704 ''' Update the program file with the latest version from the repository '''
2705 # Note: downloader only used for options
2706 if not os.access(filename, os.W_OK):
2707 sys.exit('ERROR: no write permissions on %s' % filename)
2709 downloader.to_screen('Updating to latest stable version...')
2712 latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
2713 latest_version = urllib.urlopen(latest_url).read().strip()
2714 prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
2715 newcontent = urllib.urlopen(prog_url).read()
2716 except (IOError, OSError), err:
2717 sys.exit('ERROR: unable to download latest version')
2720 stream = open(filename, 'w')
2721 stream.write(newcontent)
2723 except (IOError, OSError), err:
2724 sys.exit('ERROR: unable to overwrite current version')
2726 downloader.to_screen('Updated to version %s' % latest_version)
# Interior of parseOpts() (its "def parseOpts():" line is on a missing line
# above this chunk): builds the optparse parser, registers all option
# groups, parses sys.argv and returns (parser, opts, args).
# NOTE(review): scattered lines are missing (the "opts = []" initializer in
# _format_option_string, parts of the kw dict literal, blank/group lines).
2734 def _format_option_string(option):
2735 ''' ('-o', '--option') -> -o, --format METAVAR'''
# Build the help-column string from the first short and long spellings.
2739 if option._short_opts: opts.append(option._short_opts[0])
2740 if option._long_opts: opts.append(option._long_opts[0])
2741 if len(opts) > 1: opts.insert(1, ', ')
2743 if option.takes_value(): opts.append(' %s' % option.metavar)
2745 return "".join(opts)
# Use the custom option-string formatter for help output.
2747 fmt = optparse.IndentedHelpFormatter()
2748 fmt.format_option_strings = _format_option_string
# kw dict literal for OptionParser (opening line is missing here).
2751 'version' : __version__,
2753 'usage' : 'Usage : %prog [options] url...',
2754 'conflict_handler' : 'resolve',
2757 parser = optparse.OptionParser(**kw)
2760 general = optparse.OptionGroup(parser, 'General Options')
2761 authentication = optparse.OptionGroup(parser, 'Authentication Options')
2762 video_format = optparse.OptionGroup(parser, 'Video Format Options')
2763 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
2764 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
2765 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
2767 general.add_option('-h', '--help',
2768 action='help', help='print this help text and exit')
2769 general.add_option('-v', '--version',
2770 action='version', help='print program version and exit')
2771 general.add_option('-U', '--update',
2772 action='store_true', dest='update_self', help='update this program to latest stable version')
2773 general.add_option('-i', '--ignore-errors',
2774 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
2775 general.add_option('-r', '--rate-limit',
2776 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
2777 general.add_option('-R', '--retries',
2778 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
2779 general.add_option('--playlist-start',
2780 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
2781 general.add_option('--playlist-end',
2782 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
2783 general.add_option('--dump-user-agent',
2784 action='store_true', dest='dump_user_agent',
2785 help='display the current browser identification', default=False)
2787 authentication.add_option('-u', '--username',
2788 dest='username', metavar='USERNAME', help='account username')
2789 authentication.add_option('-p', '--password',
2790 dest='password', metavar='PASSWORD', help='account password')
2791 authentication.add_option('-n', '--netrc',
2792 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
2795 video_format.add_option('-f', '--format',
2796 action='store', dest='format', metavar='FORMAT', help='video format code')
2797 video_format.add_option('--all-formats',
2798 action='store_const', dest='format', help='download all available video formats', const='-1')
2799 video_format.add_option('--max-quality',
2800 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
2803 verbosity.add_option('-q', '--quiet',
2804 action='store_true', dest='quiet', help='activates quiet mode', default=False)
2805 verbosity.add_option('-s', '--simulate',
2806 action='store_true', dest='simulate', help='do not download video', default=False)
2807 verbosity.add_option('-g', '--get-url',
2808 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
2809 verbosity.add_option('-e', '--get-title',
2810 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
2811 verbosity.add_option('--get-thumbnail',
2812 action='store_true', dest='getthumbnail',
2813 help='simulate, quiet but print thumbnail URL', default=False)
2814 verbosity.add_option('--get-description',
2815 action='store_true', dest='getdescription',
2816 help='simulate, quiet but print video description', default=False)
2817 verbosity.add_option('--get-filename',
2818 action='store_true', dest='getfilename',
2819 help='simulate, quiet but print output filename', default=False)
2820 verbosity.add_option('--no-progress',
2821 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
2822 verbosity.add_option('--console-title',
2823 action='store_true', dest='consoletitle',
2824 help='display progress in console titlebar', default=False)
2827 filesystem.add_option('-t', '--title',
2828 action='store_true', dest='usetitle', help='use title in file name', default=False)
2829 filesystem.add_option('-l', '--literal',
2830 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
2831 filesystem.add_option('-A', '--auto-number',
2832 action='store_true', dest='autonumber',
2833 help='number downloaded files starting from 00000', default=False)
2834 filesystem.add_option('-o', '--output',
2835 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
2836 filesystem.add_option('-a', '--batch-file',
2837 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
2838 filesystem.add_option('-w', '--no-overwrites',
2839 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
2840 filesystem.add_option('-c', '--continue',
2841 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
2842 filesystem.add_option('--cookies',
2843 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
2844 filesystem.add_option('--no-part',
2845 action='store_true', dest='nopart', help='do not use .part files', default=False)
2846 filesystem.add_option('--no-mtime',
2847 action='store_false', dest='updatetime',
2848 help='do not use the Last-modified header to set the file modification time', default=True)
2851 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
2852 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
2853 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
2854 help='"best", "aac" or "mp3"; best by default')
# Group registration order controls the order groups appear in --help.
2857 parser.add_option_group(general)
2858 parser.add_option_group(filesystem)
2859 parser.add_option_group(verbosity)
2860 parser.add_option_group(video_format)
2861 parser.add_option_group(authentication)
2862 parser.add_option_group(postproc)
2864 opts, args = parser.parse_args()
2866 return parser, opts, args
2869 if __name__ == '__main__':
2871 parser, opts, args = parseOpts()
2873 # Open appropriate CookieJar
2874 if opts.cookiefile is None:
2875 jar = cookielib.CookieJar()
2878 jar = cookielib.MozillaCookieJar(opts.cookiefile)
2879 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
2881 except (IOError, OSError), err:
2882 sys.exit(u'ERROR: unable to open cookie file')
2885 if opts.dump_user_agent:
2886 print std_headers['User-Agent']
2889 # General configuration
2890 cookie_processor = urllib2.HTTPCookieProcessor(jar)
2891 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()))
2892 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
2894 # Batch file verification
2896 if opts.batchfile is not None:
2898 if opts.batchfile == '-':
2901 batchfd = open(opts.batchfile, 'r')
2902 batchurls = batchfd.readlines()
2903 batchurls = [x.strip() for x in batchurls]
2904 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
# NOTE(review): this span is the tail of the command-line entry point (the
# enclosing `def` begins before this excerpt).  Each line carries an embedded
# original line number ("2906 ...") and indentation has been stripped; gaps in
# that numbering (e.g. 2908, 2926, 2930, 2933, 2936, 2939, 2998) show that
# source lines — including the `try:` statements matching the orphaned
# `except` clauses below and the closing `})` of the FileDownloader call —
# were lost in extraction.  The code as shown is NOT runnable as-is.
2906 sys.exit(u'ERROR: batch file could not be read')
# Combine URLs read from the batch file with positional command-line args.
2907 all_urls = batchurls + args
2909 # Conflicting, missing and erroneous options
# Reject mutually-exclusive or incomplete option combinations up front,
# before any network work is attempted.
2910 if opts.usenetrc and (opts.username is not None or opts.password is not None):
2911 parser.error(u'using .netrc conflicts with giving username/password')
2912 if opts.password is not None and opts.username is None:
2913 parser.error(u'account username missing')
2914 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
2915 parser.error(u'using output template conflicts with using title, literal title or auto number')
2916 if opts.usetitle and opts.useliteral:
2917 parser.error(u'using title conflicts with using literal title')
# Username given without a password: prompt interactively instead of erroring.
2918 if opts.username is not None and opts.password is None:
2919 opts.password = getpass.getpass(u'Type account password and press return:')
# Normalize the rate limit ("50k", "4.2m", ...) to a numeric byte count;
# FileDownloader.parse_bytes returns None on unparseable input.
2920 if opts.ratelimit is not None:
2921 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
2922 if numeric_limit is None:
2923 parser.error(u'invalid rate limit specified')
2924 opts.ratelimit = numeric_limit
2925 if opts.retries is not None:
# NOTE(review): the `try:` for the except on line 2928 was at the elided
# line 2926.  Coerces --retries to an integer.
2927 opts.retries = long(opts.retries)
2928 except (TypeError, ValueError), err:
2929 parser.error(u'invalid retry count specified')
# NOTE(review): another elided `try:` (line 2930) guarded this coercion.
# --playlist-start must be a positive integer (1-based).
2931 opts.playliststart = long(opts.playliststart)
2932 if opts.playliststart <= 0:
2934 except (TypeError, ValueError), err:
2935 parser.error(u'invalid playlist start number specified')
# NOTE(review): elided `try:` (line 2936) guarded this coercion as well.
# --playlist-end is either -1 (no limit) or a positive integer that must
# not precede the start index.
2937 opts.playlistend = long(opts.playlistend)
2938 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
2940 except (TypeError, ValueError), err:
2941 parser.error(u'invalid playlist end number specified')
# Audio extraction accepts only a closed set of codec names.
2942 if opts.extractaudio:
2943 if opts.audioformat not in ['best', 'aac', 'mp3']:
2944 parser.error(u'invalid audio format specified')
2946 # Information extractors
# Instantiate one extractor per supported site.  Several extractors wrap
# another (e.g. the playlist/user/search extractors delegate to youtube_ie),
# so construction order matters here.
2947 youtube_ie = YoutubeIE()
2948 metacafe_ie = MetacafeIE(youtube_ie)
2949 dailymotion_ie = DailymotionIE()
2950 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
2951 youtube_user_ie = YoutubeUserIE(youtube_ie)
2952 youtube_search_ie = YoutubeSearchIE(youtube_ie)
2953 google_ie = GoogleIE()
2954 google_search_ie = GoogleSearchIE(google_ie)
2955 photobucket_ie = PhotobucketIE()
2956 yahoo_ie = YahooIE()
2957 yahoo_search_ie = YahooSearchIE(yahoo_ie)
2958 deposit_files_ie = DepositFilesIE()
2959 facebook_ie = FacebookIE()
2960 generic_ie = GenericIE()
# Build the downloader from validated options.  NOTE(review): the closing
# `})` of this call (original line 2998) is missing from the excerpt.
2963 fd = FileDownloader({
2964 'usenetrc': opts.usenetrc,
2965 'username': opts.username,
2966 'password': opts.password,
# Any of the --get-* flags implies quiet + simulate mode (print-only, no
# download) — see the matching 'simulate' entry below.
2967 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
2968 'forceurl': opts.geturl,
2969 'forcetitle': opts.gettitle,
2970 'forcethumbnail': opts.getthumbnail,
2971 'forcedescription': opts.getdescription,
2972 'forcefilename': opts.getfilename,
2973 'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename),
2974 'format': opts.format,
2975 'format_limit': opts.format_limit,
# First truthy alternative wins: an explicit -o template (decoded from the
# locale's preferred encoding), then progressively more specific defaults
# built from --title/--literal/--auto-number/--format, falling back to
# '%(id)s.%(ext)s'.
2976 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
2977 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
2978 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
2979 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
2980 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
2981 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
2982 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
2983 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
2984 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
2985 or u'%(id)s.%(ext)s'),
2986 'ignoreerrors': opts.ignoreerrors,
2987 'ratelimit': opts.ratelimit,
2988 'nooverwrites': opts.nooverwrites,
2989 'retries': opts.retries,
2990 'continuedl': opts.continue_dl,
2991 'noprogress': opts.noprogress,
2992 'playliststart': opts.playliststart,
2993 'playlistend': opts.playlistend,
# Writing the video to stdout ('-o -') would collide with progress output,
# so status messages go to stderr in that case.
2994 'logtostderr': opts.outtmpl == '-',
2995 'consoletitle': opts.consoletitle,
2996 'nopart': opts.nopart,
2997 'updatetime': opts.updatetime,
# Register extractors.  Order matters: more specific matchers (search,
# playlist, user) are tried before the plain site extractors.
2999 fd.add_info_extractor(youtube_search_ie)
3000 fd.add_info_extractor(youtube_pl_ie)
3001 fd.add_info_extractor(youtube_user_ie)
3002 fd.add_info_extractor(metacafe_ie)
3003 fd.add_info_extractor(dailymotion_ie)
3004 fd.add_info_extractor(youtube_ie)
3005 fd.add_info_extractor(google_ie)
3006 fd.add_info_extractor(google_search_ie)
3007 fd.add_info_extractor(photobucket_ie)
3008 fd.add_info_extractor(yahoo_ie)
3009 fd.add_info_extractor(yahoo_search_ie)
3010 fd.add_info_extractor(deposit_files_ie)
3011 fd.add_info_extractor(facebook_ie)
3013 # This must come last since it's the
3014 # fallback if none of the others work
3015 fd.add_info_extractor(generic_ie)
# Optional post-download audio extraction via ffmpeg.
3018 if opts.extractaudio:
3019 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat))
# -U/--update: replace the running script in place.
3022 if opts.update_self:
3023 updateSelf(fd, sys.argv[0])
# No URLs is an error — unless the user only asked for a self-update,
# which is a complete invocation on its own.
3026 if len(all_urls) < 1:
3027 if not opts.update_self:
3028 parser.error(u'you must provide at least one URL')
# Download everything; the per-URL result is folded into the process
# exit code (presumably returned by the enclosing function — not visible here).
3031 retcode = fd.download(all_urls)
3033 # Dump cookie jar if requested
# NOTE(review): the `try:` and the jar.save(...) call (elided lines
# 3035-3036) are missing; only the except clause survives.
3034 if opts.cookiefile is not None:
3037 except (IOError, OSError), err:
3038 sys.exit(u'ERROR: unable to save cookie jar')
# Top-level error handling for the whole run.  NOTE(review): the matching
# `try:` and the DownloadError body (line 3043) are elided; DownloadError
# details are assumed to have been reported already by the downloader —
# confirm against the full source.
3042 except DownloadError:
3044 except SameFileError:
3045 sys.exit(u'ERROR: fixed output name but more than one file to download')
3046 except KeyboardInterrupt:
3047 sys.exit(u'\nERROR: Interrupted by user')
3049 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: