2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # Author: Vasyl' Vavrychuk
7 # License: Public domain code
30 # parse_qs was moved from the cgi module to the urlparse module recently.
32 from urlparse import parse_qs
34 from cgi import parse_qs
37 'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.12) Gecko/20101028 Firefox/3.6.12',
38 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
39 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
40 'Accept-Encoding': 'gzip, deflate',
41 'Accept-Language': 'en-us,en;q=0.5',
44 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    # Inner generator probes the locale once; only its first yielded
    # value is consumed below.
    def yield_preferredencoding():
        # NOTE(review): the probe/fallback body (encoding test, 'UTF-8'
        # fallback, the yield loop) is not visible in this chunk.
        pref = locale.getpreferredencoding()
    # .next() is the Python 2 iterator protocol: pull the first (and only
    # meaningful) value out of the generator.
    return yield_preferredencoding().next()
62 def htmlentity_transform(matchobj):
63 """Transforms an HTML entity to a Unicode character.
65 This function receives a match object and is intended to be used with
66 the re.sub() function.
68 entity = matchobj.group(1)
70 # Known non-numeric HTML entity
71 if entity in htmlentitydefs.name2codepoint:
72 return unichr(htmlentitydefs.name2codepoint[entity])
75 mobj = re.match(ur'(?u)#(x?\d+)', entity)
77 numstr = mobj.group(1)
78 if numstr.startswith(u'x'):
80 numstr = u'0%s' % numstr
83 return unichr(long(numstr, base))
85 # Unknown entity in name, return its literal representation
86 return (u'&%s;' % entity)
88 def sanitize_title(utitle):
89 """Sanitizes a video title so it could be used as part of a filename."""
90 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
91 return utitle.replace(unicode(os.sep), u'%')
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
        # On Windows, stdout must be switched to binary mode or the
        # downloaded data would be corrupted by newline translation.
        # NOTE(review): this branch sits under an elided
        # "if filename == u'-':" stdout special-case.
        if sys.platform == 'win32':
            msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout, filename)
        stream = open(filename, open_mode)
        return (stream, filename)
    except (IOError, OSError), err:
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(filename, open_mode)
        return (stream, filename)
class DownloadError(Exception):
    """Download Error exception.

    Thrown by FileDownloader objects that are not configured to continue
    on errors; instances carry the appropriate error message.
    """
    pass
class SameFileError(Exception):
    """Same File exception.

    Thrown by FileDownloader objects when they detect that several of the
    requested downloads would end up in the very same file on disk.
    """
    pass
class PostProcessingError(Exception):
    """Post Processing exception.

    A PostProcessor's .run() method may raise this to indicate that the
    postprocessing task failed.
    """
    pass
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that the site does not
    offer for that particular video.
    """
    pass
class ContentTooShortError(Exception):
    """Content Too Short exception.

    Raised by FileDownloader objects when the file they downloaded is
    smaller than the size the server announced first, which indicates the
    connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        """Record the observed and the announced sizes (both byte counts)."""
        self.downloaded, self.expected = downloaded, expected
class YoutubeDLHandler(urllib2.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """
        # NOTE(review): the two returns below belong to a deflate(data)
        # helper whose def line is not visible in this chunk.
        # Raw deflate stream (no zlib header): negative wbits skips the
        # header check.
        return zlib.decompress(data, -zlib.MAX_WBITS)
        # Fallback for zlib-wrapped deflate data.
        return zlib.decompress(data)

    def http_request(self, req):
        # Apply every standard header to the outgoing request.
        for h in std_headers:
            req.add_header(h, std_headers[h])
        # Internal opt-out: strip the marker header (and Accept-encoding)
        # before the request goes on the wire.
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']

    def http_response(self, req, resp):
        # gzip: wrap the raw body in a GzipFile and rebuild the response
        # object so callers transparently read decompressed data.
        if resp.headers.get('Content-encoding', '') == 'gzip':
            gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
            resp = urllib2.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate: decompress via the helper above, same rebuild dance.
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = StringIO.StringIO(self.deflate(resp.read()))
            resp = urllib2.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
class FileDownloader(object):
    """File Downloader class.

    File downloader objects are the ones responsible of downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. As, given a video URL, the downloader doesn't know how to
    extract all the needed information, task that InfoExtractors do, it
    has to pass the URL to one of them.

    For this, file downloader objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the file downloader handles it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    asks the FileDownloader to process the video information, possibly
    downloading the video.

    File downloaders accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The FileDownloader also
    registers itself as the downloader in charge for the InfoExtractors
    that are added to it, so this is a "mutual registration".

    Available options:

    username:         Username for authentication purposes.
    password:         Password for authentication purposes.
    usenetrc:         Use netrc for authentication instead.
    quiet:            Do not print messages to stdout.
    forceurl:         Force printing final URL.
    forcetitle:       Force printing title.
    forcethumbnail:   Force printing thumbnail URL.
    forcedescription: Force printing description.
    simulate:         Do not download the video files.
    format:           Video format code.
    format_limit:     Highest quality format to try.
    outtmpl:          Template for output names.
    ignoreerrors:     Do not stop on download errors.
    ratelimit:        Download speed limit, in bytes/sec.
    nooverwrites:     Prevent overwriting files.
    retries:          Number of times to retry for HTTP error 5xx
    continuedl:       Try to continue downloads if possible.
    noprogress:       Do not print the progress bar.
    playliststart:    Playlist item to start at.
    playlistend:      Playlist item to end at.
    logtostderr:      Log messages to stderr instead of stdout.
    consoletitle:     Display progress in console window's titlebar.
    nopart:           Do not use temporary .part files.
    """

    # Class-level placeholders; real per-instance values are set in
    # __init__ below.
    _download_retcode = None
    _num_downloads = None

    def __init__(self, params):
        """Create a FileDownloader object with the given options."""
        self._download_retcode = 0
        self._num_downloads = 0
        # Messages go to stderr when 'logtostderr' is set, stdout otherwise
        # (bool indexes the two-element list).
        self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]

    def pmkdir(filename):
        """Create directory components in filename. Similar to Unix "mkdir -p"."""
        components = filename.split(os.sep)
        # Build the list of progressively longer directory prefixes.
        aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
        aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
        for dir in aggregate:
            if not os.path.exists(dir):

    def format_bytes(bytes):
        # Render a byte count with a one-letter binary-magnitude suffix,
        # e.g. '1.50M'.
        if type(bytes) is str:
        # Magnitude index: 0 = bytes, 1 = KiB, 2 = MiB, ...
        exponent = long(math.log(bytes, 1024.0))
        suffix = 'bkMGTPEZY'[exponent]
        converted = float(bytes) / float(1024**exponent)
        return '%.2f%s' % (converted, suffix)

    def calc_percent(byte_counter, data_len):
        # Fixed-width (6 chars) percentage string for the progress bar.
        return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

    def calc_eta(start, now, total, current):
        # Estimate remaining time from the average rate so far.
        if current == 0 or dif < 0.001: # One millisecond
        rate = float(current) / dif
        eta = long((float(total) - float(current)) / rate)
        (eta_mins, eta_secs) = divmod(eta, 60)
        return '%02d:%02d' % (eta_mins, eta_secs)

    def calc_speed(start, now, bytes):
        # Average download speed formatted as '<size>/s', fixed width.
        if bytes == 0 or dif < 0.001: # One millisecond
            return '%10s' % '---b/s'
        return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

    def best_block_size(elapsed_time, bytes):
        # Adapt the read block size towards ~1 block/second, clamped to
        # [previous/2, previous*2] and an absolute 4 MB ceiling.
        new_min = max(bytes / 2.0, 1.0)
        new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
        if elapsed_time < 0.001:
        rate = bytes / elapsed_time

    def parse_bytes(bytestr):
        """Parse a string indicating a byte quantity into a long integer."""
        matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
        number = float(matchobj.group(1))
        # Empty suffix maps to index 0 ('b') = multiplier 1.
        multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
        return long(round(number * multiplier))

    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        # Mutual registration: the IE keeps a reference back to us.
        ie.set_downloader(self)

    def add_post_processor(self, pp):
        """Add a PostProcessor object to the end of the chain."""
        pp.set_downloader(self)

    def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
        """Print message to stdout if not in quiet mode."""
        if not self.params.get('quiet', False):
            # skip_eol selects u'' instead of the newline terminator so
            # progress lines can be overwritten with '\r'.
            terminator = [u'\n', u''][skip_eol]
            print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
            self._screen_file.flush()
        except (UnicodeEncodeError), err:
            # Swallow console-encoding failures only when the caller asked
            # for it; otherwise re-raise.
            if not ignore_encoding_errors:

    def to_stderr(self, message):
        """Print message to stderr."""
        print >>sys.stderr, message.encode(preferredencoding())

    def to_cons_title(self, message):
        """Set console/terminal window title to message."""
        if not self.params.get('consoletitle', False):
        if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
            # c_wchar_p() might not be necessary if `message` is
            # already of type unicode()
            ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
        elif 'TERM' in os.environ:
            # xterm escape sequence: OSC 0 ; <title> BEL
            sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))

    def fixed_template(self):
        """Checks if the output template is fixed."""
        # "Fixed" means it contains no %(field)s substitutions, so every
        # download would land in the same file.
        return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)

    def trouble(self, message=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.
        """
        if message is not None:
            self.to_stderr(message)
        if not self.params.get('ignoreerrors', False):
            raise DownloadError(message)
        # ignoreerrors: remember the failure for the final exit code.
        self._download_retcode = 1

    def slow_down(self, start_time, byte_counter):
        """Sleep if the download speed is over the rate limit."""
        rate_limit = self.params.get('ratelimit', None)
        if rate_limit is None or byte_counter == 0:
        elapsed = now - start_time
        speed = float(byte_counter) / elapsed
        if speed > rate_limit:
            # Sleep just long enough that the average rate drops back to
            # the limit.
            time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)

    def temp_name(self, filename):
        """Returns a temporary filename for the given filename."""
        # No .part file for stdout, when disabled, or when the target
        # exists but is not a regular file (e.g. a device).
        if self.params.get('nopart', False) or filename == u'-' or \
                (os.path.exists(filename) and not os.path.isfile(filename)):
        return filename + u'.part'

    def undo_temp_name(self, filename):
        """Strip the .part suffix produced by temp_name(), if present."""
        if filename.endswith(u'.part'):
            return filename[:-len(u'.part')]

    def try_rename(self, old_filename, new_filename):
        """Rename the .part file to its final name, reporting failures."""
        if old_filename == new_filename:
        os.rename(old_filename, new_filename)
        except (IOError, OSError), err:
            self.trouble(u'ERROR: unable to rename file')

    def report_destination(self, filename):
        """Report destination filename."""
        self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)

    def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
        """Report download progress."""
        if self.params.get('noprogress', False):
        # '\r' + skip_eol rewrites the same console line on each update.
        self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
        self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
                (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))

    def report_resuming_byte(self, resume_len):
        """Report attempt to resume at given byte."""
        self.to_screen(u'[download] Resuming download at byte %s' % resume_len)

    def report_retry(self, count, retries):
        """Report retry in case of HTTP error 5xx"""
        self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))

    def report_file_already_downloaded(self, file_name):
        """Report file has already been fully downloaded."""
            self.to_screen(u'[download] %s has already been downloaded' % file_name)
        except (UnicodeEncodeError), err:
            # Fall back to a filename-free message when the name cannot be
            # encoded for the console.
            self.to_screen(u'[download] The file has already been downloaded')

    def report_unable_to_resume(self):
        """Report it was impossible to resume download."""
        self.to_screen(u'[download] Unable to resume')

    def report_finish(self):
        """Report download finished."""
        if self.params.get('noprogress', False):
            self.to_screen(u'[download] Download completed')

    def increment_downloads(self):
        """Increment the ordinal that assigns a number to each file."""
        self._num_downloads += 1

    def process_info(self, info_dict):
        """Process a single dictionary returned by an InfoExtractor."""
        # Do nothing else if in simulate mode
        if self.params.get('simulate', False):
            # Forced printings go to real stdout regardless of logtostderr.
            if self.params.get('forcetitle', False):
                print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forceurl', False):
                print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
                print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forcedescription', False) and 'description' in info_dict:
                print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')

            # Expand the output template with the video info plus two
            # synthetic fields.
            template_dict = dict(info_dict)
            template_dict['epoch'] = unicode(long(time.time()))
            template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
            filename = self.params['outtmpl'] % template_dict
        except (ValueError, KeyError), err:
            self.trouble(u'ERROR: invalid system charset or erroneous output template')

        if self.params.get('nooverwrites', False) and os.path.exists(filename):
            self.to_stderr(u'WARNING: file exists and will be skipped')

            self.pmkdir(filename)
        except (OSError, IOError), err:
            self.trouble(u'ERROR: unable to create directories: %s' % str(err))

            success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
        except (OSError, IOError), err:
            # Local filesystem errors mean the video itself could not be
            # saved in this format.
            raise UnavailableVideoError
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self.trouble(u'ERROR: unable to download video data: %s' % str(err))
        except (ContentTooShortError, ), err:
            self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))

                self.post_process(filename, info_dict)
            except (PostProcessingError), err:
                self.trouble(u'ERROR: postprocessing: %s' % str(err))

    def download(self, url_list):
        """Download a given list of URLs."""
        # A fixed template would force every URL into the same file.
        if len(url_list) > 1 and self.fixed_template():
            raise SameFileError(self.params['outtmpl'])

            suitable_found = False
                # Go to next InfoExtractor if not suitable
                if not ie.suitable(url):

                # Suitable InfoExtractor found
                suitable_found = True

                # Extract information from URL and process it

                # Suitable InfoExtractor had been found; go to next URL

            if not suitable_found:
                self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)

        return self._download_retcode

    def post_process(self, filename, ie_info):
        """Run the postprocessing chain on the given file."""
        # Each post processor receives the info dict augmented with the
        # final file path.
        info['filepath'] = filename

    def _download_with_rtmpdump(self, filename, url, player_url):
        """Download an rtmp:// URL by shelling out to rtmpdump."""
        self.report_destination(filename)
        tmpfilename = self.temp_name(filename)

        # Check for rtmpdump first
            subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
        except (OSError, IOError):
            self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')

        # Download using rtmpdump. rtmpdump returns exit code 2 when
        # the connection was interrumpted and resuming appears to be
        # possible. This is part of rtmpdump's normal usage, AFAIK.
        basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
        retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
        while retval == 2 or retval == 1:
            prevsize = os.path.getsize(tmpfilename)
            self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
            time.sleep(5.0) # This seems to be needed
            # Resume (-e); '-k 1' is only added after a code-1 exit.
            retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
            cursize = os.path.getsize(tmpfilename)
            if prevsize == cursize and retval == 1:
            # Success path: report final size and promote the .part file.
            self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
            self.try_rename(tmpfilename, filename)
            self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)

    def _do_download(self, filename, url, player_url):
        """Download url to filename over HTTP, resuming and retrying."""
        # Check file already present
        if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
            self.report_file_already_downloaded(filename)

        # Attempt to download using rtmpdump
        if url.startswith('rtmp'):
            return self._download_with_rtmpdump(filename, url, player_url)

        tmpfilename = self.temp_name(filename)

        # Do not include the Accept-Encoding header
        headers = {'Youtubedl-no-compression': 'True'}
        # basic_request is kept Range-free for the 416 recovery path below.
        basic_request = urllib2.Request(url, None, headers)
        request = urllib2.Request(url, None, headers)

        # Establish possible resume length
        if os.path.isfile(tmpfilename):
            resume_len = os.path.getsize(tmpfilename)

        # Request parameters in case of being able to resume
        if self.params.get('continuedl', False) and resume_len != 0:
            self.report_resuming_byte(resume_len)
            request.add_header('Range','bytes=%d-' % resume_len)

        retries = self.params.get('retries', 0)
        while count <= retries:
            # Establish connection
                data = urllib2.urlopen(request)
            except (urllib2.HTTPError, ), err:
                if (err.code < 500 or err.code >= 600) and err.code != 416:
                    # Unexpected HTTP error
                elif err.code == 416:
                    # Unable to resume (requested range not satisfiable)
                        # Open the connection again without the range header
                        data = urllib2.urlopen(basic_request)
                        content_length = data.info()['Content-Length']
                    except (urllib2.HTTPError, ), err:
                        if err.code < 500 or err.code >= 600:
                        # Examine the reported length
                        if (content_length is not None and
                                (resume_len - 100 < long(content_length) < resume_len + 100)):
                            # The file had already been fully downloaded.
                            # Explanation to the above condition: in issue #175 it was revealed that
                            # YouTube sometimes adds or removes a few bytes from the end of the file,
                            # changing the file size slightly and causing problems for some users. So
                            # I decided to implement a suggested change and consider the file
                            # completely downloaded if the file size differs less than 100 bytes from
                            # the one in the hard drive.
                            self.report_file_already_downloaded(filename)
                            self.try_rename(tmpfilename, filename)
                            # The length does not match, we start the download over
                            self.report_unable_to_resume()
            # 5xx: count the attempt and retry.
            self.report_retry(count, retries)

            self.trouble(u'ERROR: giving up after %s retries' % retries)

        data_len = data.info().get('Content-length', None)
        if data_len is not None:
            # Content-length covers only the remaining range; add what we
            # already have on disk.
            data_len = long(data_len) + resume_len
        data_len_str = self.format_bytes(data_len)
        byte_counter = 0 + resume_len
            data_block = data.read(block_size)
            if len(data_block) == 0:
            byte_counter += len(data_block)

            # Open file just in time
                (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
                # sanitize_open may have tweaked the name; recompute the
                # final filename from it.
                filename = self.undo_temp_name(tmpfilename)
                self.report_destination(filename)
            except (OSError, IOError), err:
                self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
                stream.write(data_block)
            except (IOError, OSError), err:
                self.trouble(u'\nERROR: unable to write data: %s' % str(err))
            block_size = self.best_block_size(after - before, len(data_block))

            # Progress message
            percent_str = self.calc_percent(byte_counter, data_len)
            eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
            speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
            self.report_progress(percent_str, data_len_str, speed_str, eta_str)

            # Apply rate limit
            self.slow_down(start, byte_counter - resume_len)

        if data_len is not None and byte_counter != data_len:
            raise ContentTooShortError(byte_counter, long(data_len))
        self.try_rename(tmpfilename, filename)
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. The information is stored in a dictionary
    which is then passed to the FileDownloader. The FileDownloader
    processes this information possibly downloading the video to the file
    system, among other possible outcomes. The dictionaries must include
    the following fields:

    id:         Video identifier.
    url:        Final video URL.
    uploader:   Nickname of the video uploader.
    title:      Literal title.
    stitle:     Simplified title.
    ext:        Video filename extension.
    format:     Video format.
    player_url: SWF Player URL (may be None).

    The following fields are optional. Their primary purpose is to allow
    youtube-dl to serve as the backend for a video search function, such
    as the one in youtube2mp3. They are only used when their respective
    forced printing functions are called:

    thumbnail:   Full URL to a video thumbnail image.
    description: One-line video description.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods, as well as the suitable() static method.
    Probably, they should also be instantiated and added to the main
    downloader.
    """

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self.set_downloader(downloader)

        # NOTE(review): the docstring below belongs to the suitable(url)
        # static method, whose def line is not visible in this chunk.
        """Receives a URL and returns True if suitable for this IE."""

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Group 1 captures the scheme/host/path prefix (optional); group 2 is
    # the video id; the conditional (?(1).+)? requires trailing context
    # only when the prefix matched.
    _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:v/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
    # Maps format code -> filename extension ('flv' is the fallback).
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever

        # NOTE(review): this return belongs to the suitable(url) static
        # method, whose def line is not visible in this chunk.
        return (re.match(YoutubeIE._VALID_URL, url) is not None)

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _real_initialize(self):
        """Set language, optionally log in, and confirm age."""
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                # .netrc problems are a warning, not a fatal error.
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))

        # Set language (forces English pages so regexes below match).
        request = urllib2.Request(self._LANG_URL)
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))

        # No authentication to be performed

        # Log in
        login_form = {
            'current_form': 'loginForm',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
            login_results = urllib2.urlopen(request).read()
            # The login form re-appearing in the response means the
            # credentials were rejected.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))

        # Confirm age
        age_form = {
            'action_confirm': 'Confirm',
        }
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
            self.report_age_confirmation()
            age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))

    def _real_extract(self, url):
        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(2)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
            # Unescape the JSON-style backslash escapes in the URL.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Get video info: try several 'el' variants until one returns a
        # token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url)
                video_info_webpage = urllib2.urlopen(request).read()
                # parse_qs maps every key to a *list* of values.
                video_info = parse_qs(video_info_webpage)
                if 'token' in video_info:
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')
        video_title = sanitize_title(video_title)

        # simplified title: collapse every run of non-allowed characters
        # to a single underscore.
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
        simple_title = simple_title.strip(ur'_')

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # upload date, normalized to YYYYMMDD when one of the known
        # formats parses.
        mobj = re.search(r'id="eow-date".*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y']
            for expression in format_expressions:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # description (only fetched when it will actually be printed)
        video_description = 'No description available.'
        if self._downloader.params.get('forcedescription', False):
            mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
            if mobj is not None:
                video_description = mobj.group(1)

        # token
        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'fmt_url_map' in video_info:
            # fmt_url_map is 'code|url,code|url,...'
            url_map = dict(tuple(pair.split('|')) for pair in video_info['fmt_url_map'][0].split(','))
            format_limit = self._downloader.params.get('format_limit', None)
            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                video_url_list = [(req_format, url_map[req_format])] # Specific format

        elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            # RTMP downloads have no format code.
            video_url_list = [(None, video_info['conn'][0])]

            self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')

        for format_param, video_real_url in video_url_list:
            # At this point we have a new video
            self._downloader.increment_downloads()

            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            # Find the video URL in fmt_url_map or conn paramters

            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description.decode('utf-8'),
                'player_url': player_url,
            except UnavailableVideoError, err:
                self._downloader.trouble(u'\nERROR: unable to download video')
# Extractor workflow: confirm Metacafe's family-filter age gate once in
# _real_initialize, then per URL either hand YouTube-hosted videos off to a
# YoutubeIE instance or scrape the watch page for media URL, title, uploader.
1086 class MetacafeIE(InfoExtractor):
1087 """Information Extractor for metacafe.com."""
# Group 1 of _VALID_URL is the video id, group 2 the URL's title slug.
1089 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1090 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1091 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
# youtube_ie: a YoutubeIE instance used to delegate "yt-<id>" videos that
# Metacafe merely wraps around YouTube.
1094 def __init__(self, youtube_ie, downloader=None):
1095 InfoExtractor.__init__(self, downloader)
1096 self._youtube_ie = youtube_ie
1100 return (re.match(MetacafeIE._VALID_URL, url) is not None)
1102 def report_disclaimer(self):
1103 """Report disclaimer retrieval."""
1104 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1106 def report_age_confirmation(self):
1107 """Report attempt to confirm age."""
1108 self._downloader.to_screen(u'[metacafe] Confirming age')
1110 def report_download_webpage(self, video_id):
1111 """Report webpage download."""
1112 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1114 def report_extraction(self, video_id):
1115 """Report information extraction."""
1116 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
# Fetches the disclaimer page, then POSTs the family-filter form so later
# requests are allowed past the age gate. Both responses are read but the
# bodies themselves are not inspected.
1118 def _real_initialize(self):
1119 # Retrieve disclaimer
1120 request = urllib2.Request(self._DISCLAIMER)
1122 self.report_disclaimer()
1123 disclaimer = urllib2.urlopen(request).read()
1124 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1125 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1131 'submit': "Continue - I'm over 18",
1133 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1135 self.report_age_confirmation()
1136 disclaimer = urllib2.urlopen(request).read()
1137 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1138 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1141 def _real_extract(self, url):
1142 # Extract id and simplified title from URL
1143 mobj = re.match(self._VALID_URL, url)
1145 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1148 video_id = mobj.group(1)
1150 # Check if video comes from YouTube
1151 mobj2 = re.match(r'^yt-(.*)$', video_id)
1152 if mobj2 is not None:
# Delegate "yt-<id>" wrappers entirely to the YouTube extractor.
1153 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1156 # At this point we have a new video
1157 self._downloader.increment_downloads()
1159 simple_title = mobj.group(2).decode('utf-8')
1161 # Retrieve video webpage to extract further information
1162 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1164 self.report_download_webpage(video_id)
1165 webpage = urllib2.urlopen(request).read()
1166 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1167 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1170 # Extract URL, uploader and title from webpage
1171 self.report_extraction(video_id)
# Preferred path: the page exposes the media URL directly as "&mediaURL=...".
1172 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1173 if mobj is not None:
1174 mediaURL = urllib.unquote(mobj.group(1))
# Extension is taken from the URL's last three characters (e.g. "flv").
1175 video_extension = mediaURL[-3:]
1177 # Extract gdaKey if available
1178 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1180 video_url = mediaURL
1182 gdaKey = mobj.group(1)
# Some videos require the gdaKey token appended as the __gda__ parameter.
1183 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Fallback path: parse the flashvars value as a query string and pull the
# media URL and key out of its "mediaData" JSON-like payload.
1185 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1187 self._downloader.trouble(u'ERROR: unable to extract media URL')
1189 vardict = parse_qs(mobj.group(1))
1190 if 'mediaData' not in vardict:
1191 self._downloader.trouble(u'ERROR: unable to extract media URL')
1193 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1195 self._downloader.trouble(u'ERROR: unable to extract media URL')
# Undo JSON escaping of forward slashes before using the URL.
1197 mediaURL = mobj.group(1).replace('\\/', '/')
1198 video_extension = mediaURL[-3:]
1199 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1201 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1203 self._downloader.trouble(u'ERROR: unable to extract title')
1205 video_title = mobj.group(1).decode('utf-8')
1206 video_title = sanitize_title(video_title)
1208 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1210 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1212 video_uploader = mobj.group(1)
1215 # Process video information
1216 self._downloader.process_info({
1217 'id': video_id.decode('utf-8'),
1218 'url': video_url.decode('utf-8'),
1219 'uploader': video_uploader.decode('utf-8'),
1220 'upload_date': u'NA',
1221 'title': video_title,
1222 'stitle': simple_title,
1223 'ext': video_extension.decode('utf-8'),
1227 except UnavailableVideoError:
1228 self._downloader.trouble(u'\nERROR: unable to download video')
# Scrapes a Dailymotion video page: media URL comes from the Flash player's
# addVariable("video", ...) call, title from the <title> tag, uploader from
# the owner <div>. Downloads are always treated as FLV.
1231 class DailymotionIE(InfoExtractor):
1232 """Information Extractor for Dailymotion"""
# Group 1 of _VALID_URL is the video id, group 2 the URL's title slug.
1234 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1236 def __init__(self, downloader=None):
1237 InfoExtractor.__init__(self, downloader)
1241 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1243 def report_download_webpage(self, video_id):
1244 """Report webpage download."""
1245 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1247 def report_extraction(self, video_id):
1248 """Report information extraction."""
1249 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
# No initialization needed for Dailymotion.
1251 def _real_initialize(self):
1254 def _real_extract(self, url):
1255 # Extract id and simplified title from URL
1256 mobj = re.match(self._VALID_URL, url)
1258 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1261 # At this point we have a new video
1262 self._downloader.increment_downloads()
1263 video_id = mobj.group(1)
1265 simple_title = mobj.group(2).decode('utf-8')
# Extension is hard-coded; the Flash player serves FLV.
1266 video_extension = 'flv'
1268 # Retrieve video webpage to extract further information
1269 request = urllib2.Request(url)
1271 self.report_download_webpage(video_id)
1272 webpage = urllib2.urlopen(request).read()
1273 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1274 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1277 # Extract URL, uploader and title from webpage
1278 self.report_extraction(video_id)
# The player is configured via JS: addVariable("video", "<escaped URL>").
1279 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1281 self._downloader.trouble(u'ERROR: unable to extract media URL')
1283 mediaURL = urllib.unquote(mobj.group(1))
1285 # if needed add http://www.dailymotion.com/ if relative URL
1287 video_url = mediaURL
1289 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
# Title is the <title> text with the leading "Dailymotion - " prefix stripped.
1290 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1292 self._downloader.trouble(u'ERROR: unable to extract title')
1294 video_title = mobj.group(1).decode('utf-8')
1295 video_title = sanitize_title(video_title)
1297 mobj = re.search(r'(?im)<div class="dmco_html owner">.*?<a class="name" href="/.+?">(.+?)</a>', webpage)
1299 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1301 video_uploader = mobj.group(1)
1304 # Process video information
1305 self._downloader.process_info({
1306 'id': video_id.decode('utf-8'),
1307 'url': video_url.decode('utf-8'),
1308 'uploader': video_uploader.decode('utf-8'),
1309 'upload_date': u'NA',
1310 'title': video_title,
1311 'stitle': simple_title,
1312 'ext': video_extension.decode('utf-8'),
1316 except UnavailableVideoError:
1317 self._downloader.trouble(u'\nERROR: unable to download video')
# Scrapes a Google Video page. Tries the MP4 "download_url" first; when that
# is absent falls back to the FLV player URL embedded as escaped JavaScript
# ("videoUrl\x3d..."). Thumbnail extraction requires a second request and is
# only performed when the 'forcethumbnail' option is set.
1319 class GoogleIE(InfoExtractor):
1320 """Information extractor for video.google.com."""
# Group 1 of _VALID_URL is the docid video identifier.
1322 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1324 def __init__(self, downloader=None):
1325 InfoExtractor.__init__(self, downloader)
1329 return (re.match(GoogleIE._VALID_URL, url) is not None)
1331 def report_download_webpage(self, video_id):
1332 """Report webpage download."""
1333 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1335 def report_extraction(self, video_id):
1336 """Report information extraction."""
1337 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
# No initialization needed for Google Video.
1339 def _real_initialize(self):
1342 def _real_extract(self, url):
1343 # Extract id from URL
1344 mobj = re.match(self._VALID_URL, url)
1346 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1349 # At this point we have a new video
1350 self._downloader.increment_downloads()
1351 video_id = mobj.group(1)
# Optimistic default; downgraded to 'flv' below if no MP4 download_url exists.
1353 video_extension = 'mp4'
1355 # Retrieve video webpage to extract further information
1356 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1358 self.report_download_webpage(video_id)
1359 webpage = urllib2.urlopen(request).read()
1360 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1361 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1364 # Extract URL, uploader, and title from webpage
1365 self.report_extraction(video_id)
# Preferred: the page advertises a direct MP4 link as download_url:'...'.
1366 mobj = re.search(r"download_url:'([^']+)'", webpage)
# Fallback: FLV player URL, JS-escaped ("\x3d" for '=', "\x26" for '&').
1368 video_extension = 'flv'
1369 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1371 self._downloader.trouble(u'ERROR: unable to extract media URL')
1373 mediaURL = urllib.unquote(mobj.group(1))
# Undo the JavaScript hex escapes left in the unquoted URL.
1374 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1375 mediaURL = mediaURL.replace('\\x26', '\x26')
1377 video_url = mediaURL
1379 mobj = re.search(r'<title>(.*)</title>', webpage)
1381 self._downloader.trouble(u'ERROR: unable to extract title')
1383 video_title = mobj.group(1).decode('utf-8')
1384 video_title = sanitize_title(video_title)
# Collapse characters outside [A-Za-z0-9] into underscores for filenames.
1385 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1387 # Extract video description
1388 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1390 self._downloader.trouble(u'ERROR: unable to extract video description')
1392 video_description = mobj.group(1).decode('utf-8')
1393 if not video_description:
1394 video_description = 'No description available.'
1396 # Extract video thumbnail
# Thumbnail lives on the search-results page, so it costs an extra request;
# only fetched when the user explicitly asked for thumbnails.
1397 if self._downloader.params.get('forcethumbnail', False):
1398 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1400 webpage = urllib2.urlopen(request).read()
1401 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1402 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1404 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1406 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1408 video_thumbnail = mobj.group(1)
1409 else: # we need something to pass to process_info
1410 video_thumbnail = ''
1414 # Process video information
1415 self._downloader.process_info({
1416 'id': video_id.decode('utf-8'),
1417 'url': video_url.decode('utf-8'),
1419 'upload_date': u'NA',
1420 'title': video_title,
1421 'stitle': simple_title,
1422 'ext': video_extension.decode('utf-8'),
1426 except UnavailableVideoError:
1427 self._downloader.trouble(u'\nERROR: unable to download video')
# Scrapes a Photobucket FLV page: media URL comes from the ?file= parameter
# of the <link rel="video_src"> tag; title and uploader are both parsed out
# of the "<title> ... video by ... - Photobucket" pattern.
1430 class PhotobucketIE(InfoExtractor):
1431 """Information extractor for photobucket.com."""
# Group 1 of _VALID_URL is the ?current=<name>.flv file name used as the id.
1433 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1435 def __init__(self, downloader=None):
1436 InfoExtractor.__init__(self, downloader)
1440 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1442 def report_download_webpage(self, video_id):
1443 """Report webpage download."""
1444 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1446 def report_extraction(self, video_id):
1447 """Report information extraction."""
1448 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
# No initialization needed for Photobucket.
1450 def _real_initialize(self):
1453 def _real_extract(self, url):
1454 # Extract id from URL
1455 mobj = re.match(self._VALID_URL, url)
1457 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1460 # At this point we have a new video
1461 self._downloader.increment_downloads()
1462 video_id = mobj.group(1)
# _VALID_URL only matches *.flv resources, so the extension is fixed.
1464 video_extension = 'flv'
1466 # Retrieve video webpage to extract further information
1467 request = urllib2.Request(url)
1469 self.report_download_webpage(video_id)
1470 webpage = urllib2.urlopen(request).read()
1471 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1472 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1475 # Extract URL, uploader, and title from webpage
1476 self.report_extraction(video_id)
1477 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1479 self._downloader.trouble(u'ERROR: unable to extract media URL')
1481 mediaURL = urllib.unquote(mobj.group(1))
1483 video_url = mediaURL
# One regex yields both title (group 1) and uploader (group 2).
1485 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1487 self._downloader.trouble(u'ERROR: unable to extract title')
1489 video_title = mobj.group(1).decode('utf-8')
1490 video_title = sanitize_title(video_title)
1491 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1493 video_uploader = mobj.group(2).decode('utf-8')
1496 # Process video information
1497 self._downloader.process_info({
1498 'id': video_id.decode('utf-8'),
1499 'url': video_url.decode('utf-8'),
1500 'uploader': video_uploader,
1501 'upload_date': u'NA',
1502 'title': video_title,
1503 'stitle': simple_title,
1504 'ext': video_extension.decode('utf-8'),
1508 except UnavailableVideoError:
1509 self._downloader.trouble(u'\nERROR: unable to download video')
# Two-step extractor for Yahoo! Video. Non-/watch/ URLs are first fetched to
# recover the ("id", "vid") pair, rewritten to the canonical /watch/ form,
# and re-processed recursively (new_video=False). The media URL itself comes
# from a playlist XML served by cosmos.bcst.yahoo.com.
1512 class YahooIE(InfoExtractor):
1513 """Information extractor for video.yahoo.com."""
1515 # _VALID_URL matches all Yahoo! Video URLs
1516 # _VPAGE_URL matches only the extractable '/watch/' URLs
1517 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1518 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1520 def __init__(self, downloader=None):
1521 InfoExtractor.__init__(self, downloader)
1525 return (re.match(YahooIE._VALID_URL, url) is not None)
1527 def report_download_webpage(self, video_id):
1528 """Report webpage download."""
1529 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1531 def report_extraction(self, video_id):
1532 """Report information extraction."""
1533 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
# No initialization needed for Yahoo! Video.
1535 def _real_initialize(self):
# new_video: False on the recursive second pass after URL rewriting, so the
# rewritten /watch/ URL is processed without being treated as another video.
1538 def _real_extract(self, url, new_video=True):
1539 # Extract ID from URL
1540 mobj = re.match(self._VALID_URL, url)
1542 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1545 # At this point we have a new video
1546 self._downloader.increment_downloads()
1547 video_id = mobj.group(2)
1548 video_extension = 'flv'
1550 # Rewrite valid but non-extractable URLs as
1551 # extractable English language /watch/ URLs
1552 if re.match(self._VPAGE_URL, url) is None:
1553 request = urllib2.Request(url)
1555 webpage = urllib2.urlopen(request).read()
1556 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1557 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1560 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1562 self._downloader.trouble(u'ERROR: Unable to extract id field')
1564 yahoo_id = mobj.group(1)
1566 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1568 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1570 yahoo_vid = mobj.group(1)
# Recurse once with the canonical /watch/<vid>/<id> URL.
1572 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1573 return self._real_extract(url, new_video=False)
1575 # Retrieve video webpage to extract further information
1576 request = urllib2.Request(url)
1578 self.report_download_webpage(video_id)
1579 webpage = urllib2.urlopen(request).read()
1580 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1581 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1584 # Extract uploader and title from webpage
1585 self.report_extraction(video_id)
1586 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1588 self._downloader.trouble(u'ERROR: unable to extract video title')
1590 video_title = mobj.group(1).decode('utf-8')
1591 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1593 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1595 self._downloader.trouble(u'ERROR: unable to extract video uploader')
# NOTE(review): group(1) is the "(people|profile)" path segment of the href;
# the displayed uploader name is captured by group(2). Looks wrong — verify.
1597 video_uploader = mobj.group(1).decode('utf-8')
1599 # Extract video thumbnail
1600 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1602 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1604 video_thumbnail = mobj.group(1).decode('utf-8')
1606 # Extract video description
1607 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1609 self._downloader.trouble(u'ERROR: unable to extract video description')
1611 video_description = mobj.group(1).decode('utf-8')
1612 if not video_description: video_description = 'No description available.'
1614 # Extract video height and width
# Dimensions are required by the playlist request below (vidH/vidW params).
1615 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1617 self._downloader.trouble(u'ERROR: unable to extract video height')
1619 yv_video_height = mobj.group(1)
1621 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1623 self._downloader.trouble(u'ERROR: unable to extract video width')
1625 yv_video_width = mobj.group(1)
1627 # Retrieve video playlist to extract media URL
1628 # I'm not completely sure what all these options are, but we
1629 # seem to need most of them, otherwise the server sends a 401.
1630 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1631 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1632 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1633 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1634 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1636 self.report_download_webpage(video_id)
1637 webpage = urllib2.urlopen(request).read()
1638 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1639 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1642 # Extract media URL from playlist XML
1643 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1645 self._downloader.trouble(u'ERROR: Unable to extract media URL')
# Final URL is APP + FULLPATH, unquoted and with HTML entities resolved.
1647 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1648 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1651 # Process video information
1652 self._downloader.process_info({
1653 'id': video_id.decode('utf-8'),
1655 'uploader': video_uploader,
1656 'upload_date': u'NA',
1657 'title': video_title,
1658 'stitle': simple_title,
1659 'ext': video_extension.decode('utf-8'),
# NOTE(review): 'thumbnail' and 'description' appear twice in this literal;
# Python keeps the later entries, so the two lines below win. Also,
# video_thumbnail is already unicode (decoded at extraction above), so the
# extra .decode('utf-8') on the first 'thumbnail' entry is redundant and can
# raise for non-ASCII values — the duplicates should be removed.
1660 'thumbnail': video_thumbnail.decode('utf-8'),
1661 'description': video_description,
1662 'thumbnail': video_thumbnail,
1663 'description': video_description,
1666 except UnavailableVideoError:
1667 self._downloader.trouble(u'\nERROR: unable to download video')
# Last-resort extractor for arbitrary pages: looks for a JW Player/SWFObject
# "file=" flashvar, then a broader "file="/"source=" parameter, and derives
# id, extension and uploader heuristically from the URL itself.
1670 class GenericIE(InfoExtractor):
1671 """Generic last-resort information extractor."""
1673 def __init__(self, downloader=None):
1674 InfoExtractor.__init__(self, downloader)
1680 def report_download_webpage(self, video_id):
1681 """Report webpage download."""
# Warn loudly: reaching this extractor means no specific one matched.
1682 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1683 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1685 def report_extraction(self, video_id):
1686 """Report information extraction."""
1687 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
# No initialization needed for the generic extractor.
1689 def _real_initialize(self):
1692 def _real_extract(self, url):
1693 # At this point we have a new video
1694 self._downloader.increment_downloads()
# Provisional id: last path segment; refined from the media URL below.
1696 video_id = url.split('/')[-1]
1697 request = urllib2.Request(url)
1699 self.report_download_webpage(video_id)
1700 webpage = urllib2.urlopen(request).read()
1701 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1702 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1704 except ValueError, err:
1705 # since this is the last-resort InfoExtractor, if
1706 # this error is thrown, it'll be thrown here
1707 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1710 self.report_extraction(video_id)
1711 # Start with something easy: JW Player in SWFObject
1712 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1714 # Broaden the search a little bit
1715 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1717 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1720 # It's possible that one of the regexes
1721 # matched, but returned an empty group:
1722 if mobj.group(1) is None:
1723 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1726 video_url = urllib.unquote(mobj.group(1))
1727 video_id = os.path.basename(video_url)
1729 # here's a fun little line of code for you:
# Split "name.ext" once: extension without the dot, then id without the ext.
1730 video_extension = os.path.splitext(video_id)[1][1:]
1731 video_id = os.path.splitext(video_id)[0]
1733 # it's tempting to parse this further, but you would
1734 # have to take into account all the variations like
1735 # Video Title - Site Name
1736 # Site Name | Video Title
1737 # Video Title - Tagline | Site Name
1738 # and so on and so forth; it's just not practical
1739 mobj = re.search(r'<title>(.*)</title>', webpage)
1741 self._downloader.trouble(u'ERROR: unable to extract title')
1743 video_title = mobj.group(1).decode('utf-8')
1744 video_title = sanitize_title(video_title)
1745 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1747 # video uploader is domain name
1748 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
# NOTE(review): message says "title" but this guard covers the
# uploader/domain match — looks like a copy-paste slip in the error text.
1750 self._downloader.trouble(u'ERROR: unable to extract title')
1752 video_uploader = mobj.group(1).decode('utf-8')
1755 # Process video information
1756 self._downloader.process_info({
1757 'id': video_id.decode('utf-8'),
1758 'url': video_url.decode('utf-8'),
1759 'uploader': video_uploader,
1760 'upload_date': u'NA',
1761 'title': video_title,
1762 'stitle': simple_title,
1763 'ext': video_extension.decode('utf-8'),
1767 except UnavailableVideoError, err:
1768 self._downloader.trouble(u'\nERROR: unable to download video')
# Handles "ytsearch[N|all]:<terms>" queries: parses the requested result
# count from the prefix, then pages through YouTube search results, feeding
# each found video id to the wrapped YoutubeIE.
1771 class YoutubeSearchIE(InfoExtractor):
1772 """Information Extractor for YouTube search queries."""
1773 _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1774 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1775 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1776 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
# Hard cap on results; matches what the site will actually return.
1778 _max_youtube_results = 1000
# youtube_ie: the YoutubeIE instance that performs the actual extraction.
1780 def __init__(self, youtube_ie, downloader=None):
1781 InfoExtractor.__init__(self, downloader)
1782 self._youtube_ie = youtube_ie
1786 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1788 def report_download_page(self, query, pagenum):
1789 """Report attempt to download playlist page with given number."""
1790 query = query.decode(preferredencoding())
1791 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1793 def _real_initialize(self):
1794 self._youtube_ie.initialize()
# Prefix semantics: "ytsearch:" -> 1 result, "ytsearchall:" -> the maximum,
# "ytsearchN:" -> N results (clamped to the maximum).
1796 def _real_extract(self, query):
1797 mobj = re.match(self._VALID_QUERY, query)
1799 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1802 prefix, query = query.split(':')
1804 query = query.encode('utf-8')
1806 self._download_n_results(query, 1)
1808 elif prefix == 'all':
1809 self._download_n_results(query, self._max_youtube_results)
1815 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1817 elif n > self._max_youtube_results:
1818 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1819 n = self._max_youtube_results
1820 self._download_n_results(query, n)
1822 except ValueError: # parsing prefix as integer fails
1823 self._download_n_results(query, 1)
1826 def _download_n_results(self, query, n):
1827 """Downloads a specified number of results for a query"""
# already_seen deduplicates ids, since a result page links each video
# multiple times.
1830 already_seen = set()
1834 self.report_download_page(query, pagenum)
1835 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1836 request = urllib2.Request(result_url)
1838 page = urllib2.urlopen(request).read()
1839 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1840 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1843 # Extract video identifiers
1844 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# The match is 'href="/watch?v=ID"'; split on '=' takes the id ([2]) and
# [:-1] strips the trailing double quote.
1845 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1846 if video_id not in already_seen:
1847 video_ids.append(video_id)
1848 already_seen.add(video_id)
1849 if len(video_ids) == n:
1850 # Specified n videos reached
1851 for id in video_ids:
1852 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# No "Next" link: fewer than n results exist; extract what was collected.
1855 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1856 for id in video_ids:
1857 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1860 pagenum = pagenum + 1
# Handles "gvsearch[N|all]:<terms>" queries; same structure as
# YoutubeSearchIE but pages through Google Video search results and feeds
# docids to the wrapped GoogleIE.
1862 class GoogleSearchIE(InfoExtractor):
1863 """Information Extractor for Google Video search queries."""
1864 _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1865 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
# Group 1 of _VIDEO_INDICATOR captures the docid directly.
1866 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1867 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1869 _max_google_results = 1000
# google_ie: the GoogleIE instance that performs the actual extraction.
1871 def __init__(self, google_ie, downloader=None):
1872 InfoExtractor.__init__(self, downloader)
1873 self._google_ie = google_ie
1877 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1879 def report_download_page(self, query, pagenum):
1880 """Report attempt to download playlist page with given number."""
1881 query = query.decode(preferredencoding())
1882 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1884 def _real_initialize(self):
1885 self._google_ie.initialize()
# Prefix semantics: "gvsearch:" -> 1 result, "gvsearchall:" -> the maximum,
# "gvsearchN:" -> N results (clamped to the maximum).
1887 def _real_extract(self, query):
1888 mobj = re.match(self._VALID_QUERY, query)
1890 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1893 prefix, query = query.split(':')
1895 query = query.encode('utf-8')
1897 self._download_n_results(query, 1)
1899 elif prefix == 'all':
1900 self._download_n_results(query, self._max_google_results)
1906 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1908 elif n > self._max_google_results:
1909 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1910 n = self._max_google_results
1911 self._download_n_results(query, n)
1913 except ValueError: # parsing prefix as integer fails
1914 self._download_n_results(query, 1)
1917 def _download_n_results(self, query, n):
1918 """Downloads a specified number of results for a query"""
# already_seen deduplicates docids repeated within a results page.
1921 already_seen = set()
1925 self.report_download_page(query, pagenum)
1926 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1927 request = urllib2.Request(result_url)
1929 page = urllib2.urlopen(request).read()
1930 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1931 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1934 # Extract video identifiers
1935 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1936 video_id = mobj.group(1)
1937 if video_id not in already_seen:
1938 video_ids.append(video_id)
1939 already_seen.add(video_id)
1940 if len(video_ids) == n:
1941 # Specified n videos reached
1942 for id in video_ids:
1943 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
# No "Next" link: fewer than n results exist; extract what was collected.
1946 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1947 for id in video_ids:
1948 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1951 pagenum = pagenum + 1
# Handles "yvsearch[N|all]:<terms>" queries; same structure as
# YoutubeSearchIE but pages through Yahoo! Video search results and feeds
# watch-path ids to the wrapped YahooIE.
1953 class YahooSearchIE(InfoExtractor):
1954 """Information Extractor for Yahoo! Video search queries."""
1955 _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
1956 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
# Group 1 captures the "<vid>/<id>" watch path used to rebuild the URL.
1957 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1958 _MORE_PAGES_INDICATOR = r'\s*Next'
1960 _max_yahoo_results = 1000
# yahoo_ie: the YahooIE instance that performs the actual extraction.
1962 def __init__(self, yahoo_ie, downloader=None):
1963 InfoExtractor.__init__(self, downloader)
1964 self._yahoo_ie = yahoo_ie
1968 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
1970 def report_download_page(self, query, pagenum):
1971 """Report attempt to download playlist page with given number."""
1972 query = query.decode(preferredencoding())
1973 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1975 def _real_initialize(self):
1976 self._yahoo_ie.initialize()
# Prefix semantics: "yvsearch:" -> 1 result, "yvsearchall:" -> the maximum,
# "yvsearchN:" -> N results (clamped to the maximum).
1978 def _real_extract(self, query):
1979 mobj = re.match(self._VALID_QUERY, query)
1981 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1984 prefix, query = query.split(':')
1986 query = query.encode('utf-8')
1988 self._download_n_results(query, 1)
1990 elif prefix == 'all':
1991 self._download_n_results(query, self._max_yahoo_results)
1997 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1999 elif n > self._max_yahoo_results:
2000 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2001 n = self._max_yahoo_results
2002 self._download_n_results(query, n)
2004 except ValueError: # parsing prefix as integer fails
2005 self._download_n_results(query, 1)
2008 def _download_n_results(self, query, n):
2009 """Downloads a specified number of results for a query"""
# already_seen deduplicates ids repeated within a results page.
2012 already_seen = set()
2016 self.report_download_page(query, pagenum)
2017 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2018 request = urllib2.Request(result_url)
2020 page = urllib2.urlopen(request).read()
2021 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2022 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2025 # Extract video identifiers
2026 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2027 video_id = mobj.group(1)
2028 if video_id not in already_seen:
2029 video_ids.append(video_id)
2030 already_seen.add(video_id)
2031 if len(video_ids) == n:
2032 # Specified n videos reached
2033 for id in video_ids:
2034 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
# No "Next" link: fewer than n results exist; extract what was collected.
2037 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2038 for id in video_ids:
2039 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2042 pagenum = pagenum + 1
# Information extractor for YouTube playlist / my_playlists / user-list
# URLs: collects every video id across all playlist pages, applies the
# --playlist-start/--playlist-end window, then hands each id to YoutubeIE.
# NOTE(review): this listing is line-sampled -- `suitable` headers,
# `try:` lines, loop setup and the loop-exit `break` fall in the gaps.
2044 class YoutubePlaylistIE(InfoExtractor):
2045 """Information Extractor for YouTube playlists."""
2047 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/|p/)([^&]+).*'
2048 _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
2049 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2050 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2053 def __init__(self, youtube_ie, downloader=None):
2054 InfoExtractor.__init__(self, downloader)
# Actual per-video extraction is delegated to this YoutubeIE.
2055 self._youtube_ie = youtube_ie
2059 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
2061 def report_download_page(self, playlist_id, pagenum):
2062 """Report attempt to download playlist page with given number."""
2063 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2065 def _real_initialize(self):
2066 self._youtube_ie.initialize()
2068 def _real_extract(self, url):
2069 # Extract playlist id
2070 mobj = re.match(self._VALID_URL, url)
2072 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2075 # Download playlist pages
2076 playlist_id = mobj.group(1)
2081 self.report_download_page(playlist_id, pagenum)
2082 request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum))
2084 page = urllib2.urlopen(request).read()
2085 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2086 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2089 # Extract video identifiers
# Per-page de-duplication only; cross-page duplicates are kept as-is.
2091 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2092 if mobj.group(1) not in ids_in_page:
2093 ids_in_page.append(mobj.group(1))
2094 video_ids.extend(ids_in_page)
# Absence of a "Next" link means the last page was reached.
2096 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2098 pagenum = pagenum + 1
# Apply the user-requested playlist window (1-based start -> 0-based slice;
# playlistend of -1 drops the final element of the slice -- see also the
# identical logic in YoutubeUserIE).
2100 playliststart = self._downloader.params.get('playliststart', 1) - 1
2101 playlistend = self._downloader.params.get('playlistend', -1)
2102 video_ids = video_ids[playliststart:playlistend]
2104 for id in video_ids:
2105 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# Information extractor for youtube.com/user/<name> pages: fetches the
# user's GData feed, collects video ids, applies the playlist window and
# delegates each id to YoutubeIE.
# NOTE(review): line-sampled listing -- the `suitable` header, `try:`
# lines and list initializations fall in the gaps between numbered lines.
2108 class YoutubeUserIE(InfoExtractor):
2109 """Information Extractor for YouTube users."""
2111 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
2112 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
# Greedy `(.*)` capture is acknowledged as too loose by the original author.
2113 _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
2116 def __init__(self, youtube_ie, downloader=None):
2117 InfoExtractor.__init__(self, downloader)
# Per-video extraction is delegated to this YoutubeIE.
2118 self._youtube_ie = youtube_ie
2122 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2124 def report_download_page(self, username):
2125 """Report attempt to download user page."""
2126 self._downloader.to_screen(u'[youtube] user %s: Downloading page ' % (username))
2128 def _real_initialize(self):
2129 self._youtube_ie.initialize()
2131 def _real_extract(self, url):
2133 mobj = re.match(self._VALID_URL, url)
2135 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2138 # Download user page
2139 username = mobj.group(1)
2143 self.report_download_page(username)
2144 request = urllib2.Request(self._TEMPLATE_URL % (username))
2146 page = urllib2.urlopen(request).read()
2147 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2148 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2151 # Extract video identifiers
2154 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2155 if mobj.group(1) not in ids_in_page:
2156 ids_in_page.append(mobj.group(1))
2157 video_ids.extend(ids_in_page)
# Same playlist-window slicing as YoutubePlaylistIE: 1-based start becomes
# a 0-based slice index; playlistend -1 trims the last collected element.
2159 playliststart = self._downloader.params.get('playliststart', 1) - 1
2160 playlistend = self._downloader.params.get('playlistend', -1)
2161 video_ids = video_ids[playliststart:playlistend]
2163 for id in video_ids:
2164 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# Information extractor for depositfiles.com file pages.  Unlike the
# search/playlist IEs above it downloads the file itself (one download
# per URL), scraping the real fileshare URL out of the "Free download"
# response page.
# NOTE(review): line-sampled listing -- `suitable` header, `try:` lines,
# `return` statements and two dict entries of process_info() fall in the
# gaps between the numbered lines.
2167 class DepositFilesIE(InfoExtractor):
2168 """Information extractor for depositfiles.com"""
# `(?#locale)` is a regex comment: the `../` component it annotates is a
# two-character locale path segment such as "en/".
2170 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2172 def __init__(self, downloader=None):
2173 InfoExtractor.__init__(self, downloader)
2177 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2179 def report_download_webpage(self, file_id):
2180 """Report webpage download."""
2181 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2183 def report_extraction(self, file_id):
2184 """Report information extraction."""
2185 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2187 def _real_initialize(self):
2190 def _real_extract(self, url):
2191 # At this point we have a new file
2192 self._downloader.increment_downloads()
2194 file_id = url.split('/')[-1]
2195 # Rebuild url in english locale
2196 url = 'http://depositfiles.com/en/files/' + file_id
2198 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 simulates pressing the "Free download" button.
2199 free_download_indication = { 'gateway_result' : '1' }
2200 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2202 self.report_download_webpage(file_id)
2203 webpage = urllib2.urlopen(request).read()
2204 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2205 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2208 # Search for the real file URL
2209 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2210 if (mobj is None) or (mobj.group(1) is None):
2211 # Try to figure out reason of the error.
# The site explains download restrictions in a <strong>Attention...</strong>
# blurb; surface that text instead of a generic failure when present.
2212 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2213 if (mobj is not None) and (mobj.group(1) is not None):
2214 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2215 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2217 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2220 file_url = mobj.group(1)
# Extension without the leading dot, e.g. "zip".
2221 file_extension = os.path.splitext(file_url)[1][1:]
2223 # Search for file title
2224 mobj = re.search(r'<b title="(.*?)">', webpage)
2226 self._downloader.trouble(u'ERROR: unable to extract title')
2228 file_title = mobj.group(1).decode('utf-8')
2231 # Process file information
2232 self._downloader.process_info({
2233 'id': file_id.decode('utf-8'),
2234 'url': file_url.decode('utf-8'),
2236 'upload_date': u'NA',
2237 'title': file_title,
2238 'stitle': file_title,
2239 'ext': file_extension.decode('utf-8'),
2243 except UnavailableVideoError, err:
2244 self._downloader.trouble(u'ERROR: unable to download file')
class PostProcessor(object):
	"""Base class for all post-processing steps.

	Instances are registered on a downloader with its
	add_post_processor() method.  After each successful download the
	downloader walks its chain of PostProcessors, calling run() on every
	one: the first receives an initial information dictionary, each
	subsequent one receives the value returned by its predecessor.

	The chain stops as soon as a processor returns None, or when the end
	of the chain is reached.

	PostProcessor objects follow a "mutual registration" process similar
	to InfoExtractor objects: they remember the downloader they serve.
	"""

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Run the PostProcessor.

		The "information" argument is a dictionary like the ones
		composed by InfoExtractors, except that it carries an extra
		"filepath" field pointing at the downloaded file.

		Returning None stops the postprocessing chain; returning an
		information dictionary (possibly the received one with some
		fields changed) passes it to the next processor in the chain.

		This method may also raise a PostProcessingError exception,
		which the downloader takes into account.

		The base implementation performs no processing at all: it
		simply hands the dictionary back unchanged.
		"""
		return information
# Script entry point: builds the option parser, configures cookies and the
# urllib2 opener, assembles all InfoExtractors onto a FileDownloader, and
# runs the downloads.
# NOTE(review): line-sampled listing -- the `import` lines after the
# "Modules needed" comment, the outer `try:` that the trailing
# DownloadError/SameFileError/KeyboardInterrupt handlers belong to, and
# several `else:`/`try:`/`sys.exit(retcode)` lines fall in the gaps.
2292 ### MAIN PROGRAM ###
2293 if __name__ == '__main__':
2295 # Modules needed only when running the main program
2299 # Function to update the program file with the latest version from bitbucket.org
# Self-update: fetch LATEST_VERSION, then overwrite this script in place
# with the matching tagged build from GitHub.
# NOTE(review): the file is opened in text mode 'w' and no stream.close()
# is visible in this excerpt -- presumably it sits in the sampled-out line;
# verify against the full source.
2300 def update_self(downloader, filename):
2301 # Note: downloader only used for options
2302 if not os.access (filename, os.W_OK):
2303 sys.exit('ERROR: no write permissions on %s' % filename)
2305 downloader.to_screen('Updating to latest stable version...')
2306 latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
2307 latest_version = urllib.urlopen(latest_url).read().strip()
2308 prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
2309 newcontent = urllib.urlopen(prog_url).read()
2310 stream = open(filename, 'w')
2311 stream.write(newcontent)
2313 downloader.to_screen('Updated to version %s' % latest_version)
2315 # Parse command line
# conflict_handler='resolve' lets -h/-v be redefined below with short help.
2316 parser = optparse.OptionParser(
2317 usage='Usage: %prog [options] url...',
2318 version='2010.12.09',
2319 conflict_handler='resolve',
2322 parser.add_option('-h', '--help',
2323 action='help', help='print this help text and exit')
2324 parser.add_option('-v', '--version',
2325 action='version', help='print program version and exit')
2326 parser.add_option('-U', '--update',
2327 action='store_true', dest='update_self', help='update this program to latest stable version')
2328 parser.add_option('-i', '--ignore-errors',
2329 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
2330 parser.add_option('-r', '--rate-limit',
2331 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
2332 parser.add_option('-R', '--retries',
2333 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
2334 parser.add_option('--playlist-start',
2335 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
2336 parser.add_option('--playlist-end',
2337 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
2338 parser.add_option('--dump-user-agent',
2339 action='store_true', dest='dump_user_agent', help='display the current browser identification', default=False)
2341 authentication = optparse.OptionGroup(parser, 'Authentication Options')
2342 authentication.add_option('-u', '--username',
2343 dest='username', metavar='USERNAME', help='account username')
2344 authentication.add_option('-p', '--password',
2345 dest='password', metavar='PASSWORD', help='account password')
2346 authentication.add_option('-n', '--netrc',
2347 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
2348 parser.add_option_group(authentication)
2350 video_format = optparse.OptionGroup(parser, 'Video Format Options')
2351 video_format.add_option('-f', '--format',
2352 action='store', dest='format', metavar='FORMAT', help='video format code')
# --all-formats reuses dest='format' with sentinel '-1' (checked when
# building the outtmpl default below).
2353 video_format.add_option('--all-formats',
2354 action='store_const', dest='format', help='download all available video formats', const='-1')
2355 video_format.add_option('--max-quality',
2356 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
2357 parser.add_option_group(video_format)
2359 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
2360 verbosity.add_option('-q', '--quiet',
2361 action='store_true', dest='quiet', help='activates quiet mode', default=False)
2362 verbosity.add_option('-s', '--simulate',
2363 action='store_true', dest='simulate', help='do not download video', default=False)
2364 verbosity.add_option('-g', '--get-url',
2365 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
2366 verbosity.add_option('-e', '--get-title',
2367 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
2368 verbosity.add_option('--get-thumbnail',
2369 action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
2370 verbosity.add_option('--get-description',
2371 action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
2372 verbosity.add_option('--no-progress',
2373 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
2374 verbosity.add_option('--console-title',
2375 action='store_true', dest='consoletitle', help='display progress in console titlebar', default=False)
2376 parser.add_option_group(verbosity)
2378 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
2379 filesystem.add_option('-t', '--title',
2380 action='store_true', dest='usetitle', help='use title in file name', default=False)
2381 filesystem.add_option('-l', '--literal',
2382 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
2383 filesystem.add_option('-A', '--auto-number',
2384 action='store_true', dest='autonumber', help='number downloaded files starting from 00000', default=False)
2385 filesystem.add_option('-o', '--output',
2386 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
2387 filesystem.add_option('-a', '--batch-file',
2388 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
2389 filesystem.add_option('-w', '--no-overwrites',
2390 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
2391 filesystem.add_option('-c', '--continue',
2392 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
2393 filesystem.add_option('--cookies',
2394 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
2395 filesystem.add_option('--no-part',
2396 action='store_true', dest='nopart', help='do not use .part files', default=False)
2397 parser.add_option_group(filesystem)
2399 (opts, args) = parser.parse_args()
2401 # Open appropriate CookieJar
# Without --cookies: in-memory jar only; with it: a Mozilla-format jar
# loaded from disk when the file already exists and is readable.
2402 if opts.cookiefile is None:
2403 jar = cookielib.CookieJar()
2406 jar = cookielib.MozillaCookieJar(opts.cookiefile)
2407 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
2409 except (IOError, OSError), err:
2410 sys.exit(u'ERROR: unable to open cookie file')
2413 if opts.dump_user_agent:
2414 print std_headers['User-Agent']
2417 # General configuration
# Install a global opener so every urllib2 call in the IEs shares the
# proxy handler, the cookie jar and the project's YoutubeDLHandler.
2418 cookie_processor = urllib2.HTTPCookieProcessor(jar)
2419 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()))
2420 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
2422 # Batch file verification
2424 if opts.batchfile is not None:
2426 if opts.batchfile == '-':
2429 batchfd = open(opts.batchfile, 'r')
2430 batchurls = batchfd.readlines()
2431 batchurls = [x.strip() for x in batchurls]
# Skip blank lines and lines starting with a comment marker (#, / or ;).
2432 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
2434 sys.exit(u'ERROR: batch file could not be read')
2435 all_urls = batchurls + args
2437 # Conflicting, missing and erroneous options
2438 if opts.usenetrc and (opts.username is not None or opts.password is not None):
2439 parser.error(u'using .netrc conflicts with giving username/password')
2440 if opts.password is not None and opts.username is None:
2441 parser.error(u'account username missing')
2442 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
2443 parser.error(u'using output template conflicts with using title, literal title or auto number')
2444 if opts.usetitle and opts.useliteral:
2445 parser.error(u'using title conflicts with using literal title')
2446 if opts.username is not None and opts.password is None:
2447 opts.password = getpass.getpass(u'Type account password and press return:')
2448 if opts.ratelimit is not None:
2449 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
2450 if numeric_limit is None:
2451 parser.error(u'invalid rate limit specified')
2452 opts.ratelimit = numeric_limit
2453 if opts.retries is not None:
2455 opts.retries = long(opts.retries)
2456 except (TypeError, ValueError), err:
2457 parser.error(u'invalid retry count specified')
2459 opts.playliststart = long(opts.playliststart)
2460 if opts.playliststart <= 0:
2462 except (TypeError, ValueError), err:
2463 parser.error(u'invalid playlist start number specified')
2465 opts.playlistend = long(opts.playlistend)
2466 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
2468 except (TypeError, ValueError), err:
2469 parser.error(u'invalid playlist end number specified')
2471 # Information extractors
# Search/playlist/user IEs wrap the concrete site IEs they delegate to.
2472 youtube_ie = YoutubeIE()
2473 metacafe_ie = MetacafeIE(youtube_ie)
2474 dailymotion_ie = DailymotionIE()
2475 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
2476 youtube_user_ie = YoutubeUserIE(youtube_ie)
2477 youtube_search_ie = YoutubeSearchIE(youtube_ie)
2478 google_ie = GoogleIE()
2479 google_search_ie = GoogleSearchIE(google_ie)
2480 photobucket_ie = PhotobucketIE()
2481 yahoo_ie = YahooIE()
2482 yahoo_search_ie = YahooSearchIE(yahoo_ie)
2483 deposit_files_ie = DepositFilesIE()
2484 generic_ie = GenericIE()
2487 fd = FileDownloader({
2488 'usenetrc': opts.usenetrc,
2489 'username': opts.username,
2490 'password': opts.password,
# Any of the --get-* flags implies both quiet and simulate.
2491 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
2492 'forceurl': opts.geturl,
2493 'forcetitle': opts.gettitle,
2494 'forcethumbnail': opts.getthumbnail,
2495 'forcedescription': opts.getdescription,
2496 'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
2497 'format': opts.format,
2498 'format_limit': opts.format_limit,
# Output template: explicit -o wins; otherwise pick a default based on
# --all-formats ('-1'), title/literal-title and auto-number flags, falling
# through to plain '%(id)s.%(ext)s'.
2499 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
2500 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
2501 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
2502 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
2503 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
2504 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
2505 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
2506 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
2507 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
2508 or u'%(id)s.%(ext)s'),
2509 'ignoreerrors': opts.ignoreerrors,
2510 'ratelimit': opts.ratelimit,
2511 'nooverwrites': opts.nooverwrites,
2512 'retries': opts.retries,
2513 'continuedl': opts.continue_dl,
2514 'noprogress': opts.noprogress,
2515 'playliststart': opts.playliststart,
2516 'playlistend': opts.playlistend,
# '-o -' streams the video to stdout, so logging must move to stderr.
2517 'logtostderr': opts.outtmpl == '-',
2518 'consoletitle': opts.consoletitle,
2519 'nopart': opts.nopart,
# Registration order matters: more specific IEs (search, playlist, user)
# must be consulted before the site IEs they wrap.
2521 fd.add_info_extractor(youtube_search_ie)
2522 fd.add_info_extractor(youtube_pl_ie)
2523 fd.add_info_extractor(youtube_user_ie)
2524 fd.add_info_extractor(metacafe_ie)
2525 fd.add_info_extractor(dailymotion_ie)
2526 fd.add_info_extractor(youtube_ie)
2527 fd.add_info_extractor(google_ie)
2528 fd.add_info_extractor(google_search_ie)
2529 fd.add_info_extractor(photobucket_ie)
2530 fd.add_info_extractor(yahoo_ie)
2531 fd.add_info_extractor(yahoo_search_ie)
2532 fd.add_info_extractor(deposit_files_ie)
2534 # This must come last since it's the
2535 # fallback if none of the others work
2536 fd.add_info_extractor(generic_ie)
2539 if opts.update_self:
2540 update_self(fd, sys.argv[0])
2543 if len(all_urls) < 1:
2544 if not opts.update_self:
2545 parser.error(u'you must provide at least one URL')
2548 retcode = fd.download(all_urls)
2550 # Dump cookie jar if requested
2551 if opts.cookiefile is not None:
2554 except (IOError, OSError), err:
2555 sys.exit(u'ERROR: unable to save cookie jar')
# Handlers for the outer try: wrapped around the whole run (the `try:`
# line itself is sampled out of this excerpt).
2559 except DownloadError:
2561 except SameFileError:
2562 sys.exit(u'ERROR: fixed output name but more than one file to download')
2563 except KeyboardInterrupt:
2564 sys.exit(u'\nERROR: Interrupted by user')