2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # Author: Vasyl' Vavrychuk
7 # Author: Witold Baryluk
8 # License: Public domain code
31 # parse_qs was moved from the cgi module to the urlparse module recently.
33 from urlparse import parse_qs
35 from cgi import parse_qs
38 'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.12) Gecko/20101028 Firefox/3.6.12',
39 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
40 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
41 'Accept-Encoding': 'gzip, deflate',
42 'Accept-Language': 'en-us,en;q=0.5',
# Characters allowed in a "simplified" title (the 'stitle' info field):
# ASCII letters and digits, as unicode (Python 2 str.decode('ascii')).
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    def yield_preferredencoding():
        # NOTE(review): the validation of `pref` (with a UTF-8 fallback)
        # and the generator's infinite yield loop are elided in this excerpt.
        pref = locale.getpreferredencoding()
    # Pull the first value from the Python 2 generator above.
    return yield_preferredencoding().next()
63 def htmlentity_transform(matchobj):
64 """Transforms an HTML entity to a Unicode character.
66 This function receives a match object and is intended to be used with
67 the re.sub() function.
69 entity = matchobj.group(1)
71 # Known non-numeric HTML entity
72 if entity in htmlentitydefs.name2codepoint:
73 return unichr(htmlentitydefs.name2codepoint[entity])
76 mobj = re.match(ur'(?u)#(x?\d+)', entity)
78 numstr = mobj.group(1)
79 if numstr.startswith(u'x'):
81 numstr = u'0%s' % numstr
84 return unichr(long(numstr, base))
86 # Unknown entity in name, return its literal representation
87 return (u'&%s;' % entity)
def sanitize_title(utitle):
    """Sanitizes a video title so it could be used as part of a filename."""
    # First decode HTML entities, then neutralize the path separator so
    # the title cannot point outside the target directory.
    decoded_title = re.sub(u'(?u)&(.+?);', htmlentity_transform, utitle)
    return decoded_title.replace(unicode(os.sep), u'%')
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    # NOTE(review): the enclosing try: and the test that routes a u'-'
    # filename to stdout are elided in this excerpt.
    if sys.platform == 'win32':
        # Put stdout into binary mode so binary video data is not mangled
        # by newline translation on Windows.
        msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout, filename)
    stream = open(filename, open_mode)
    return (stream, filename)
    except (IOError, OSError), err:
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(filename, open_mode)
        return (stream, filename)
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # downloaded: number of bytes actually received
    # expected:   number of bytes the server announced

    def __init__(self, downloaded, expected):
        self.downloaded = downloaded
        self.expected = expected
class YoutubeDLHandler(urllib2.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    # NOTE(review): this excerpt elides several lines (method headers,
    # decorators, try: statements); statements below that look orphaned
    # correspond to those gaps.

    # Presumably a deflate() helper: raw-deflate first, zlib-wrapped as a
    # fallback -- the def/try/except framing is elided here; TODO confirm.
    return zlib.decompress(data, -zlib.MAX_WBITS)
    return zlib.decompress(data)

    def addinfourl_wrapper(stream, headers, url, code):
        # Newer urllib2.addinfourl accepts the status code directly;
        # otherwise build it without the code (the assignment of the code
        # onto `ret` is elided in this excerpt).
        if hasattr(urllib2.addinfourl, 'getcode'):
            return urllib2.addinfourl(stream, headers, url, code)
        ret = urllib2.addinfourl(stream, headers, url)

    def http_request(self, req):
        # Add the standard headers to the outgoing request.
        for h in std_headers:
            req.add_header(h, std_headers[h])
        # Honor the internal no-compression marker and strip it before the
        # request goes on the wire.
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']

    def http_response(self, req, resp):
        # Transparently decompress gzip/deflate response bodies while
        # preserving the original headers, URL, code and msg.
        # NOTE(review): `old_resp` is presumably a reference to the
        # original resp -- its assignment is elided in this excerpt.
        if resp.headers.get('Content-encoding', '') == 'gzip':
            gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = StringIO.StringIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
class FileDownloader(object):
    """File Downloader class.

    File downloader objects are the ones responsible of downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. As, given a video URL, the downloader doesn't know how to
    extract all the needed information, task that InfoExtractors do, it
    has to pass the URL to one of them.

    For this, file downloader objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the file downloader handles it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    asks the FileDownloader to process the video information, possibly
    downloading the video.

    File downloaders accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The FileDownloader also
    registers itself as the downloader in charge for the InfoExtractors
    that are added to it, so this is a "mutual registration".

    Available options:

    username: Username for authentication purposes.
    password: Password for authentication purposes.
    usenetrc: Use netrc for authentication instead.
    quiet: Do not print messages to stdout.
    forceurl: Force printing final URL.
    forcetitle: Force printing title.
    forcethumbnail: Force printing thumbnail URL.
    forcedescription: Force printing description.
    simulate: Do not download the video files.
    format: Video format code.
    format_limit: Highest quality format to try.
    outtmpl: Template for output names.
    ignoreerrors: Do not stop on download errors.
    ratelimit: Download speed limit, in bytes/sec.
    nooverwrites: Prevent overwriting files.
    retries: Number of times to retry for HTTP error 5xx
    continuedl: Try to continue downloads if possible.
    noprogress: Do not print the progress bar.
    playliststart: Playlist item to start at.
    playlistend: Playlist item to end at.
    logtostderr: Log messages to stderr instead of stdout.
    consoletitle: Display progress in console window's titlebar.
    nopart: Do not use temporary .part files.
    """

    # NOTE(review): this excerpt elides many lines (decorators, try:
    # headers, early returns, else branches); statements below that look
    # orphaned or dangling correspond to those gaps.

    _download_retcode = None  # set to 1 by trouble() when an error is tolerated
    _num_downloads = None     # ordinal used for the %(autonumber)s template field

    def __init__(self, params):
        """Create a FileDownloader object with the given options."""
        self._download_retcode = 0
        self._num_downloads = 0
        # Screen messages go to stderr instead of stdout when
        # 'logtostderr' is set.
        self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]

    def pmkdir(filename):
        """Create directory components in filename. Similar to Unix "mkdir -p"."""
        components = filename.split(os.sep)
        # Every ancestor path of the file, shortest first (the final
        # component is the file name itself and is skipped).
        aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
        aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
        for dir in aggregate:
            if not os.path.exists(dir):
                # (directory creation call elided in this excerpt)

    def format_bytes(bytes):
        """Format a byte count as a short human-readable string (e.g. '1.00k')."""
        if type(bytes) is str:
            # (string-to-number conversion elided in this excerpt)
        exponent = long(math.log(bytes, 1024.0))
        suffix = 'bkMGTPEZY'[exponent]
        converted = float(bytes) / float(1024**exponent)
        return '%.2f%s' % (converted, suffix)

    def calc_percent(byte_counter, data_len):
        """Return the progress percentage as a right-aligned string."""
        return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

    def calc_eta(start, now, total, current):
        """Return the estimated remaining time as an 'MM:SS' string."""
        # NOTE(review): `dif` is presumably now - start; its assignment is
        # elided in this excerpt -- TODO confirm.
        if current == 0 or dif < 0.001: # One millisecond
            # ('--:--' early return elided in this excerpt)
        rate = float(current) / dif
        eta = long((float(total) - float(current)) / rate)
        (eta_mins, eta_secs) = divmod(eta, 60)
        return '%02d:%02d' % (eta_mins, eta_secs)

    def calc_speed(start, now, bytes):
        """Return the download speed as a right-aligned string."""
        # NOTE(review): `dif` assignment elided in this excerpt.
        if bytes == 0 or dif < 0.001: # One millisecond
            return '%10s' % '---b/s'
        return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

    def best_block_size(elapsed_time, bytes):
        """Pick the next read size from the last block's throughput."""
        # Keep the next block within [half, double] of the previous size.
        new_min = max(bytes / 2.0, 1.0)
        new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
        if elapsed_time < 0.001:
            # (fast-path return elided in this excerpt)
        rate = bytes / elapsed_time
        # (remainder of the method elided in this excerpt)

    def parse_bytes(bytestr):
        """Parse a string indicating a byte quantity into a long integer."""
        matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
        number = float(matchobj.group(1))
        # '' and 'b' both yield 1024**0; 'k' -> 1024, 'm' -> 1024**2, ...
        multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
        return long(round(number * multiplier))

    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        # Mutual registration: the IE learns about its downloader too.
        ie.set_downloader(self)

    def add_post_processor(self, pp):
        """Add a PostProcessor object to the end of the chain."""
        pp.set_downloader(self)

    def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
        """Print message to stdout if not in quiet mode."""
        if not self.params.get('quiet', False):
            terminator = [u'\n', u''][skip_eol]
            print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
            self._screen_file.flush()
        except (UnicodeEncodeError), err:
            # Encoding failures are fatal unless the caller opted out.
            if not ignore_encoding_errors:
                # (re-raise elided in this excerpt)

    def to_stderr(self, message):
        """Print message to stderr."""
        print >>sys.stderr, message.encode(preferredencoding())

    def to_cons_title(self, message):
        """Set console/terminal window title to message."""
        if not self.params.get('consoletitle', False):
            # (early return elided in this excerpt)
        if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
            # c_wchar_p() might not be necessary if `message` is
            # already of type unicode()
            ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
        elif 'TERM' in os.environ:
            # xterm-style escape sequence: set icon name and window title.
            sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))

    def fixed_template(self):
        """Checks if the output template is fixed."""
        # "Fixed" means the template contains no %(field)s placeholder, so
        # every download would produce the same file name.
        return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)

    def trouble(self, message=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.
        """
        if message is not None:
            self.to_stderr(message)
        if not self.params.get('ignoreerrors', False):
            raise DownloadError(message)
        self._download_retcode = 1

    def slow_down(self, start_time, byte_counter):
        """Sleep if the download speed is over the rate limit."""
        rate_limit = self.params.get('ratelimit', None)
        if rate_limit is None or byte_counter == 0:
            # (early return elided; `now` is presumably time.time() --
            # its assignment is elided in this excerpt)
        elapsed = now - start_time
        speed = float(byte_counter) / elapsed
        if speed > rate_limit:
            # Sleep just long enough for the average speed to fall back
            # to the configured limit.
            time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)

    def temp_name(self, filename):
        """Returns a temporary filename for the given filename."""
        # No .part file when disabled, when writing to stdout ('-'), or
        # when the target exists but is not a regular file.
        if self.params.get('nopart', False) or filename == u'-' or \
                (os.path.exists(filename) and not os.path.isfile(filename)):
            # (return of the unmodified filename elided in this excerpt)
        return filename + u'.part'

    def undo_temp_name(self, filename):
        # Strip the u'.part' suffix added by temp_name(), if present.
        if filename.endswith(u'.part'):
            return filename[:-len(u'.part')]

    def try_rename(self, old_filename, new_filename):
        # Move the finished temporary file into place; report (but
        # tolerate, per trouble()) a failure.
        if old_filename == new_filename:
            # (early return elided in this excerpt)
        os.rename(old_filename, new_filename)
        except (IOError, OSError), err:
            self.trouble(u'ERROR: unable to rename file')

    def report_destination(self, filename):
        """Report destination filename."""
        self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)

    def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
        """Report download progress."""
        if self.params.get('noprogress', False):
            # (early return elided in this excerpt)
        # \r rewrites the current console line in place.
        self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
        self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
                (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))

    def report_resuming_byte(self, resume_len):
        """Report attempt to resume at given byte."""
        self.to_screen(u'[download] Resuming download at byte %s' % resume_len)

    def report_retry(self, count, retries):
        """Report retry in case of HTTP error 5xx"""
        self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))

    def report_file_already_downloaded(self, file_name):
        """Report file has already been fully downloaded."""
        self.to_screen(u'[download] %s has already been downloaded' % file_name)
        except (UnicodeEncodeError), err:
            # Fall back to a message without the (unencodable) file name.
            self.to_screen(u'[download] The file has already been downloaded')

    def report_unable_to_resume(self):
        """Report it was impossible to resume download."""
        self.to_screen(u'[download] Unable to resume')

    def report_finish(self):
        """Report download finished."""
        if self.params.get('noprogress', False):
            self.to_screen(u'[download] Download completed')

    def increment_downloads(self):
        """Increment the ordinal that assigns a number to each file."""
        self._num_downloads += 1

    def process_info(self, info_dict):
        """Process a single dictionary returned by an InfoExtractor."""
        # Do nothing else if in simulate mode
        if self.params.get('simulate', False):
            # Forced printings go straight to stdout (plain print).
            if self.params.get('forcetitle', False):
                print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forceurl', False):
                print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
                print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forcedescription', False) and 'description' in info_dict:
                print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')

        # Build the output filename from the template; 'epoch' and
        # 'autonumber' are synthesized fields.
        template_dict = dict(info_dict)
        template_dict['epoch'] = unicode(long(time.time()))
        template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
        filename = self.params['outtmpl'] % template_dict
        except (ValueError, KeyError), err:
            self.trouble(u'ERROR: invalid system charset or erroneous output template')
        if self.params.get('nooverwrites', False) and os.path.exists(filename):
            self.to_stderr(u'WARNING: file exists and will be skipped')

        self.pmkdir(filename)
        except (OSError, IOError), err:
            self.trouble(u'ERROR: unable to create directories: %s' % str(err))

        success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
        except (OSError, IOError), err:
            # Local filesystem failures during download are surfaced as
            # an unavailable video.
            raise UnavailableVideoError
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self.trouble(u'ERROR: unable to download video data: %s' % str(err))
        except (ContentTooShortError, ), err:
            self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))

        self.post_process(filename, info_dict)
        except (PostProcessingError), err:
            self.trouble(u'ERROR: postprocessing: %s' % str(err))

    def download(self, url_list):
        """Download a given list of URLs."""
        # A fixed template can only name one file.
        if len(url_list) > 1 and self.fixed_template():
            raise SameFileError(self.params['outtmpl'])

        # NOTE(review): the per-URL and per-InfoExtractor loop headers are
        # elided in this excerpt.
        suitable_found = False
        # Go to next InfoExtractor if not suitable
        if not ie.suitable(url):
        # Suitable InfoExtractor found
        suitable_found = True
        # Extract information from URL and process it
        # Suitable InfoExtractor had been found; go to next URL
        if not suitable_found:
            self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)

        return self._download_retcode

    def post_process(self, filename, ie_info):
        """Run the postprocessing chain on the given file."""
        # NOTE(review): the creation of `info` (presumably a copy of
        # ie_info) and the loop over postprocessors are elided here.
        info['filepath'] = filename

    def _download_with_rtmpdump(self, filename, url, player_url):
        # Download via the external rtmpdump tool; RTMP streams cannot be
        # fetched with urllib2.
        self.report_destination(filename)
        tmpfilename = self.temp_name(filename)

        # Check for rtmpdump first
        subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
        except (OSError, IOError):
            self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')

        # Download using rtmpdump. rtmpdump returns exit code 2 when
        # the connection was interrupted and resuming appears to be
        # possible. This is part of rtmpdump's normal usage, AFAIK.
        basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
        retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
        while retval == 2 or retval == 1:
            prevsize = os.path.getsize(tmpfilename)
            self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
            time.sleep(5.0) # This seems to be needed
            retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
            cursize = os.path.getsize(tmpfilename)
            if prevsize == cursize and retval == 1:
                # (loop-exit handling elided in this excerpt)
        self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
        self.try_rename(tmpfilename, filename)
        # (success/failure branching partially elided in this excerpt)
        self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)

    def _do_download(self, filename, url, player_url):
        # Check file already present
        if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
            self.report_file_already_downloaded(filename)

        # Attempt to download using rtmpdump
        if url.startswith('rtmp'):
            return self._download_with_rtmpdump(filename, url, player_url)

        tmpfilename = self.temp_name(filename)

        # Do not include the Accept-Encoding header
        headers = {'Youtubedl-no-compression': 'True'}
        # basic_request deliberately never carries a Range header; it is
        # used to probe the full length when resuming fails with 416.
        basic_request = urllib2.Request(url, None, headers)
        request = urllib2.Request(url, None, headers)

        # Establish possible resume length
        if os.path.isfile(tmpfilename):
            resume_len = os.path.getsize(tmpfilename)

        # Request parameters in case of being able to resume
        if self.params.get('continuedl', False) and resume_len != 0:
            self.report_resuming_byte(resume_len)
            request.add_header('Range','bytes=%d-' % resume_len)

        retries = self.params.get('retries', 0)
        while count <= retries:
            # Establish connection
            data = urllib2.urlopen(request)
            except (urllib2.HTTPError, ), err:
                if (err.code < 500 or err.code >= 600) and err.code != 416:
                    # Unexpected HTTP error
                elif err.code == 416:
                    # Unable to resume (requested range not satisfiable)
                    # Open the connection again without the range header
                    data = urllib2.urlopen(basic_request)
                    content_length = data.info()['Content-Length']
                    except (urllib2.HTTPError, ), err:
                        if err.code < 500 or err.code >= 600:
                    # Examine the reported length
                    if (content_length is not None and
                            (resume_len - 100 < long(content_length) < resume_len + 100)):
                        # The file had already been fully downloaded.
                        # Explanation to the above condition: in issue #175 it was revealed that
                        # YouTube sometimes adds or removes a few bytes from the end of the file,
                        # changing the file size slightly and causing problems for some users. So
                        # I decided to implement a suggested change and consider the file
                        # completely downloaded if the file size differs less than 100 bytes from
                        # the one in the hard drive.
                        self.report_file_already_downloaded(filename)
                        self.try_rename(tmpfilename, filename)
                        # The length does not match, we start the download over
                        self.report_unable_to_resume()
            self.report_retry(count, retries)
        self.trouble(u'ERROR: giving up after %s retries' % retries)

        # Total size: the server reports the remaining bytes when a Range
        # request succeeded, so add back what we already have.
        data_len = data.info().get('Content-length', None)
        if data_len is not None:
            data_len = long(data_len) + resume_len
        data_len_str = self.format_bytes(data_len)
        byte_counter = 0 + resume_len
        # NOTE(review): the read-loop header and timing bookkeeping
        # (before/after/start/block_size init) are elided in this excerpt.
        data_block = data.read(block_size)
        if len(data_block) == 0:
            # (end-of-stream break elided in this excerpt)
        byte_counter += len(data_block)

        # Open file just in time
        (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
        filename = self.undo_temp_name(tmpfilename)
        self.report_destination(filename)
        except (OSError, IOError), err:
            self.trouble(u'ERROR: unable to open for writing: %s' % str(err))

        stream.write(data_block)
        except (IOError, OSError), err:
            self.trouble(u'\nERROR: unable to write data: %s' % str(err))
        # Adapt the next read size to the observed throughput.
        block_size = self.best_block_size(after - before, len(data_block))

        # Progress reporting for this block.
        percent_str = self.calc_percent(byte_counter, data_len)
        eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
        speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
        self.report_progress(percent_str, data_len_str, speed_str, eta_str)

        # Apply the configured rate limit, if any.
        self.slow_down(start, byte_counter - resume_len)

        # A short read means the connection was probably interrupted.
        if data_len is not None and byte_counter != data_len:
            raise ContentTooShortError(byte_counter, long(data_len))
        self.try_rename(tmpfilename, filename)
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. The information is stored in a dictionary
    which is then passed to the FileDownloader. The FileDownloader
    processes this information possibly downloading the video to the file
    system, among other possible outcomes. The dictionaries must include
    the following fields:

    id:        Video identifier.
    url:       Final video URL.
    uploader:  Nickname of the video uploader.
    title:     Literal title.
    stitle:    Simplified title.
    ext:       Video filename extension.
    format:    Video format.
    player_url: SWF Player URL (may be None).

    The following fields are optional. Their primary purpose is to allow
    youtube-dl to serve as the backend for a video search function, such
    as the one in youtube2mp3. They are only used when their respective
    forced printing functions are called:

    thumbnail: Full URL to a video thumbnail image.
    description: One-line video description.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods, as well as the suitable() static method.
    Probably, they should also be instantiated and added to the main
    downloader.
    """

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self.set_downloader(downloader)

    # NOTE(review): the suitable(url) staticmethod's def line is elided in
    # this excerpt; only its docstring remains below.
    """Receives a URL and returns True if suitable for this IE."""

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
814 class YoutubeIE(InfoExtractor):
815 """Information extractor for youtube.com."""
817 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
818 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
819 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
820 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
821 _NETRC_MACHINE = 'youtube'
822 # Listed in order of quality
823 _available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
824 _video_extensions = {
830 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
837 return (re.match(YoutubeIE._VALID_URL, url) is not None)
839 def report_lang(self):
840 """Report attempt to set language."""
841 self._downloader.to_screen(u'[youtube] Setting language')
843 def report_login(self):
844 """Report attempt to log in."""
845 self._downloader.to_screen(u'[youtube] Logging in')
847 def report_age_confirmation(self):
848 """Report attempt to confirm age."""
849 self._downloader.to_screen(u'[youtube] Confirming age')
851 def report_video_webpage_download(self, video_id):
852 """Report attempt to download video webpage."""
853 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
855 def report_video_info_webpage_download(self, video_id):
856 """Report attempt to download video info webpage."""
857 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
859 def report_information_extraction(self, video_id):
860 """Report attempt to extract video information."""
861 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
863 def report_unavailable_format(self, video_id, format):
864 """Report extracted video URL."""
865 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
867 def report_rtmp_download(self):
868 """Indicate the download will use the RTMP protocol."""
869 self._downloader.to_screen(u'[youtube] RTMP download detected')
871 def _real_initialize(self):
872 if self._downloader is None:
877 downloader_params = self._downloader.params
879 # Attempt to use provided username and password or .netrc data
880 if downloader_params.get('username', None) is not None:
881 username = downloader_params['username']
882 password = downloader_params['password']
883 elif downloader_params.get('usenetrc', False):
885 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
890 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
891 except (IOError, netrc.NetrcParseError), err:
892 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
896 request = urllib2.Request(self._LANG_URL)
899 urllib2.urlopen(request).read()
900 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
901 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
904 # No authentication to be performed
910 'current_form': 'loginForm',
912 'action_login': 'Log In',
913 'username': username,
914 'password': password,
916 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
919 login_results = urllib2.urlopen(request).read()
920 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
921 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
923 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
924 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
930 'action_confirm': 'Confirm',
932 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
934 self.report_age_confirmation()
935 age_results = urllib2.urlopen(request).read()
936 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
937 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
940 def _real_extract(self, url):
941 # Extract video id from URL
942 mobj = re.match(self._VALID_URL, url)
944 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
946 video_id = mobj.group(2)
949 self.report_video_webpage_download(video_id)
950 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
952 video_webpage = urllib2.urlopen(request).read()
953 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
954 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
957 # Attempt to extract SWF player URL
958 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
960 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
965 self.report_video_info_webpage_download(video_id)
966 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
967 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
968 % (video_id, el_type))
969 request = urllib2.Request(video_info_url)
971 video_info_webpage = urllib2.urlopen(request).read()
972 video_info = parse_qs(video_info_webpage)
973 if 'token' in video_info:
975 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
976 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
978 if 'token' not in video_info:
979 if 'reason' in video_info:
980 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
982 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
985 # Start extracting information
986 self.report_information_extraction(video_id)
989 if 'author' not in video_info:
990 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
992 video_uploader = urllib.unquote_plus(video_info['author'][0])
995 if 'title' not in video_info:
996 self._downloader.trouble(u'ERROR: unable to extract video title')
998 video_title = urllib.unquote_plus(video_info['title'][0])
999 video_title = video_title.decode('utf-8')
1000 video_title = sanitize_title(video_title)
1003 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1004 simple_title = simple_title.strip(ur'_')
1007 if 'thumbnail_url' not in video_info:
1008 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1009 video_thumbnail = ''
1010 else: # don't panic if we can't find it
1011 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1015 mobj = re.search(r'id="eow-date".*?>(.*?)</span>', video_webpage, re.DOTALL)
1016 if mobj is not None:
1017 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1018 format_expressions = ['%d %B %Y', '%B %d %Y']
1019 for expression in format_expressions:
1021 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1026 video_description = 'No description available.'
1027 if self._downloader.params.get('forcedescription', False):
1028 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1029 if mobj is not None:
1030 video_description = mobj.group(1)
1033 video_token = urllib.unquote_plus(video_info['token'][0])
1035 # Decide which formats to download
1036 req_format = self._downloader.params.get('format', None)
1038 if 'fmt_url_map' in video_info:
1039 url_map = dict(tuple(pair.split('|')) for pair in video_info['fmt_url_map'][0].split(','))
1040 format_limit = self._downloader.params.get('format_limit', None)
1041 if format_limit is not None and format_limit in self._available_formats:
1042 format_list = self._available_formats[self._available_formats.index(format_limit):]
1044 format_list = self._available_formats
1045 existing_formats = [x for x in format_list if x in url_map]
1046 if len(existing_formats) == 0:
1047 self._downloader.trouble(u'ERROR: no known formats available for video')
1049 if req_format is None:
1050 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1051 elif req_format == '-1':
1052 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1055 if req_format not in url_map:
1056 self._downloader.trouble(u'ERROR: requested format not available')
1058 video_url_list = [(req_format, url_map[req_format])] # Specific format
1060 elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1061 self.report_rtmp_download()
1062 video_url_list = [(None, video_info['conn'][0])]
1065 self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
1068 for format_param, video_real_url in video_url_list:
1069 # At this point we have a new video
1070 self._downloader.increment_downloads()
1073 video_extension = self._video_extensions.get(format_param, 'flv')
1075 # Find the video URL in fmt_url_map or conn paramters
1077 # Process video information
1078 self._downloader.process_info({
1079 'id': video_id.decode('utf-8'),
1080 'url': video_real_url.decode('utf-8'),
1081 'uploader': video_uploader.decode('utf-8'),
1082 'upload_date': upload_date,
1083 'title': video_title,
1084 'stitle': simple_title,
1085 'ext': video_extension.decode('utf-8'),
1086 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1087 'thumbnail': video_thumbnail.decode('utf-8'),
1088 'description': video_description.decode('utf-8'),
1089 'player_url': player_url,
1091 except UnavailableVideoError, err:
1092 self._downloader.trouble(u'\nERROR: unable to download video')
1095 class MetacafeIE(InfoExtractor):
1096 """Information Extractor for metacafe.com."""
1098 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1099 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1100 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1103 def __init__(self, youtube_ie, downloader=None):
1104 InfoExtractor.__init__(self, downloader)
1105 self._youtube_ie = youtube_ie
1109 return (re.match(MetacafeIE._VALID_URL, url) is not None)
1111 def report_disclaimer(self):
1112 """Report disclaimer retrieval."""
1113 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1115 def report_age_confirmation(self):
1116 """Report attempt to confirm age."""
1117 self._downloader.to_screen(u'[metacafe] Confirming age')
1119 def report_download_webpage(self, video_id):
1120 """Report webpage download."""
1121 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1123 def report_extraction(self, video_id):
1124 """Report information extraction."""
1125 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1127 def _real_initialize(self):
1128 # Retrieve disclaimer
1129 request = urllib2.Request(self._DISCLAIMER)
1131 self.report_disclaimer()
1132 disclaimer = urllib2.urlopen(request).read()
1133 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1134 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1140 'submit': "Continue - I'm over 18",
1142 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1144 self.report_age_confirmation()
1145 disclaimer = urllib2.urlopen(request).read()
1146 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1147 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1150 def _real_extract(self, url):
1151 # Extract id and simplified title from URL
1152 mobj = re.match(self._VALID_URL, url)
1154 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1157 video_id = mobj.group(1)
1159 # Check if video comes from YouTube
1160 mobj2 = re.match(r'^yt-(.*)$', video_id)
1161 if mobj2 is not None:
1162 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1165 # At this point we have a new video
1166 self._downloader.increment_downloads()
1168 simple_title = mobj.group(2).decode('utf-8')
1170 # Retrieve video webpage to extract further information
1171 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1173 self.report_download_webpage(video_id)
1174 webpage = urllib2.urlopen(request).read()
1175 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1176 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1179 # Extract URL, uploader and title from webpage
1180 self.report_extraction(video_id)
1181 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1182 if mobj is not None:
1183 mediaURL = urllib.unquote(mobj.group(1))
1184 video_extension = mediaURL[-3:]
1186 # Extract gdaKey if available
1187 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1189 video_url = mediaURL
1191 gdaKey = mobj.group(1)
1192 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1194 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1196 self._downloader.trouble(u'ERROR: unable to extract media URL')
1198 vardict = parse_qs(mobj.group(1))
1199 if 'mediaData' not in vardict:
1200 self._downloader.trouble(u'ERROR: unable to extract media URL')
1202 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1204 self._downloader.trouble(u'ERROR: unable to extract media URL')
1206 mediaURL = mobj.group(1).replace('\\/', '/')
1207 video_extension = mediaURL[-3:]
1208 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1210 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1212 self._downloader.trouble(u'ERROR: unable to extract title')
1214 video_title = mobj.group(1).decode('utf-8')
1215 video_title = sanitize_title(video_title)
1217 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1219 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1221 video_uploader = mobj.group(1)
1224 # Process video information
1225 self._downloader.process_info({
1226 'id': video_id.decode('utf-8'),
1227 'url': video_url.decode('utf-8'),
1228 'uploader': video_uploader.decode('utf-8'),
1229 'upload_date': u'NA',
1230 'title': video_title,
1231 'stitle': simple_title,
1232 'ext': video_extension.decode('utf-8'),
1236 except UnavailableVideoError:
1237 self._downloader.trouble(u'\nERROR: unable to download video')
1240 class DailymotionIE(InfoExtractor):
1241 """Information Extractor for Dailymotion"""
1243 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1245 def __init__(self, downloader=None):
1246 InfoExtractor.__init__(self, downloader)
1250 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1252 def report_download_webpage(self, video_id):
1253 """Report webpage download."""
1254 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1256 def report_extraction(self, video_id):
1257 """Report information extraction."""
1258 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1260 def _real_initialize(self):
1263 def _real_extract(self, url):
1264 # Extract id and simplified title from URL
1265 mobj = re.match(self._VALID_URL, url)
1267 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1270 # At this point we have a new video
1271 self._downloader.increment_downloads()
1272 video_id = mobj.group(1)
1274 simple_title = mobj.group(2).decode('utf-8')
1275 video_extension = 'flv'
1277 # Retrieve video webpage to extract further information
1278 request = urllib2.Request(url)
1280 self.report_download_webpage(video_id)
1281 webpage = urllib2.urlopen(request).read()
1282 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1283 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1286 # Extract URL, uploader and title from webpage
1287 self.report_extraction(video_id)
1288 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1290 self._downloader.trouble(u'ERROR: unable to extract media URL')
1292 mediaURL = urllib.unquote(mobj.group(1))
1294 # if needed add http://www.dailymotion.com/ if relative URL
1296 video_url = mediaURL
1298 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1299 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1301 self._downloader.trouble(u'ERROR: unable to extract title')
1303 video_title = mobj.group(1).decode('utf-8')
1304 video_title = sanitize_title(video_title)
1306 mobj = re.search(r'(?im)<Attribute name="owner">(.+?)</Attribute>', webpage)
1308 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1310 video_uploader = mobj.group(1)
1313 # Process video information
1314 self._downloader.process_info({
1315 'id': video_id.decode('utf-8'),
1316 'url': video_url.decode('utf-8'),
1317 'uploader': video_uploader.decode('utf-8'),
1318 'upload_date': u'NA',
1319 'title': video_title,
1320 'stitle': simple_title,
1321 'ext': video_extension.decode('utf-8'),
1325 except UnavailableVideoError:
1326 self._downloader.trouble(u'\nERROR: unable to download video')
1328 class GoogleIE(InfoExtractor):
1329 """Information extractor for video.google.com."""
1331 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1333 def __init__(self, downloader=None):
1334 InfoExtractor.__init__(self, downloader)
1338 return (re.match(GoogleIE._VALID_URL, url) is not None)
1340 def report_download_webpage(self, video_id):
1341 """Report webpage download."""
1342 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1344 def report_extraction(self, video_id):
1345 """Report information extraction."""
1346 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1348 def _real_initialize(self):
1351 def _real_extract(self, url):
1352 # Extract id from URL
1353 mobj = re.match(self._VALID_URL, url)
1355 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1358 # At this point we have a new video
1359 self._downloader.increment_downloads()
1360 video_id = mobj.group(1)
1362 video_extension = 'mp4'
1364 # Retrieve video webpage to extract further information
1365 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1367 self.report_download_webpage(video_id)
1368 webpage = urllib2.urlopen(request).read()
1369 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1370 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1373 # Extract URL, uploader, and title from webpage
1374 self.report_extraction(video_id)
1375 mobj = re.search(r"download_url:'([^']+)'", webpage)
1377 video_extension = 'flv'
1378 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1380 self._downloader.trouble(u'ERROR: unable to extract media URL')
1382 mediaURL = urllib.unquote(mobj.group(1))
1383 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1384 mediaURL = mediaURL.replace('\\x26', '\x26')
1386 video_url = mediaURL
1388 mobj = re.search(r'<title>(.*)</title>', webpage)
1390 self._downloader.trouble(u'ERROR: unable to extract title')
1392 video_title = mobj.group(1).decode('utf-8')
1393 video_title = sanitize_title(video_title)
1394 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1396 # Extract video description
1397 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1399 self._downloader.trouble(u'ERROR: unable to extract video description')
1401 video_description = mobj.group(1).decode('utf-8')
1402 if not video_description:
1403 video_description = 'No description available.'
1405 # Extract video thumbnail
1406 if self._downloader.params.get('forcethumbnail', False):
1407 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1409 webpage = urllib2.urlopen(request).read()
1410 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1411 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1413 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1415 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1417 video_thumbnail = mobj.group(1)
1418 else: # we need something to pass to process_info
1419 video_thumbnail = ''
1423 # Process video information
1424 self._downloader.process_info({
1425 'id': video_id.decode('utf-8'),
1426 'url': video_url.decode('utf-8'),
1428 'upload_date': u'NA',
1429 'title': video_title,
1430 'stitle': simple_title,
1431 'ext': video_extension.decode('utf-8'),
1435 except UnavailableVideoError:
1436 self._downloader.trouble(u'\nERROR: unable to download video')
1439 class PhotobucketIE(InfoExtractor):
1440 """Information extractor for photobucket.com."""
1442 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1444 def __init__(self, downloader=None):
1445 InfoExtractor.__init__(self, downloader)
1449 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1451 def report_download_webpage(self, video_id):
1452 """Report webpage download."""
1453 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1455 def report_extraction(self, video_id):
1456 """Report information extraction."""
1457 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1459 def _real_initialize(self):
1462 def _real_extract(self, url):
1463 # Extract id from URL
1464 mobj = re.match(self._VALID_URL, url)
1466 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1469 # At this point we have a new video
1470 self._downloader.increment_downloads()
1471 video_id = mobj.group(1)
1473 video_extension = 'flv'
1475 # Retrieve video webpage to extract further information
1476 request = urllib2.Request(url)
1478 self.report_download_webpage(video_id)
1479 webpage = urllib2.urlopen(request).read()
1480 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1481 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1484 # Extract URL, uploader, and title from webpage
1485 self.report_extraction(video_id)
1486 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1488 self._downloader.trouble(u'ERROR: unable to extract media URL')
1490 mediaURL = urllib.unquote(mobj.group(1))
1492 video_url = mediaURL
1494 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1496 self._downloader.trouble(u'ERROR: unable to extract title')
1498 video_title = mobj.group(1).decode('utf-8')
1499 video_title = sanitize_title(video_title)
1500 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1502 video_uploader = mobj.group(2).decode('utf-8')
1505 # Process video information
1506 self._downloader.process_info({
1507 'id': video_id.decode('utf-8'),
1508 'url': video_url.decode('utf-8'),
1509 'uploader': video_uploader,
1510 'upload_date': u'NA',
1511 'title': video_title,
1512 'stitle': simple_title,
1513 'ext': video_extension.decode('utf-8'),
1517 except UnavailableVideoError:
1518 self._downloader.trouble(u'\nERROR: unable to download video')
1521 class YahooIE(InfoExtractor):
1522 """Information extractor for video.yahoo.com."""
1524 # _VALID_URL matches all Yahoo! Video URLs
1525 # _VPAGE_URL matches only the extractable '/watch/' URLs
1526 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1527 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1529 def __init__(self, downloader=None):
1530 InfoExtractor.__init__(self, downloader)
1534 return (re.match(YahooIE._VALID_URL, url) is not None)
1536 def report_download_webpage(self, video_id):
1537 """Report webpage download."""
1538 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1540 def report_extraction(self, video_id):
1541 """Report information extraction."""
1542 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1544 def _real_initialize(self):
1547 def _real_extract(self, url, new_video=True):
1548 # Extract ID from URL
1549 mobj = re.match(self._VALID_URL, url)
1551 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1554 # At this point we have a new video
1555 self._downloader.increment_downloads()
1556 video_id = mobj.group(2)
1557 video_extension = 'flv'
1559 # Rewrite valid but non-extractable URLs as
1560 # extractable English language /watch/ URLs
1561 if re.match(self._VPAGE_URL, url) is None:
1562 request = urllib2.Request(url)
1564 webpage = urllib2.urlopen(request).read()
1565 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1566 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1569 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1571 self._downloader.trouble(u'ERROR: Unable to extract id field')
1573 yahoo_id = mobj.group(1)
1575 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1577 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1579 yahoo_vid = mobj.group(1)
1581 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1582 return self._real_extract(url, new_video=False)
1584 # Retrieve video webpage to extract further information
1585 request = urllib2.Request(url)
1587 self.report_download_webpage(video_id)
1588 webpage = urllib2.urlopen(request).read()
1589 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1590 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1593 # Extract uploader and title from webpage
1594 self.report_extraction(video_id)
1595 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1597 self._downloader.trouble(u'ERROR: unable to extract video title')
1599 video_title = mobj.group(1).decode('utf-8')
1600 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1602 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1604 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1606 video_uploader = mobj.group(1).decode('utf-8')
1608 # Extract video thumbnail
1609 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1611 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1613 video_thumbnail = mobj.group(1).decode('utf-8')
1615 # Extract video description
1616 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1618 self._downloader.trouble(u'ERROR: unable to extract video description')
1620 video_description = mobj.group(1).decode('utf-8')
1621 if not video_description: video_description = 'No description available.'
1623 # Extract video height and width
1624 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1626 self._downloader.trouble(u'ERROR: unable to extract video height')
1628 yv_video_height = mobj.group(1)
1630 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1632 self._downloader.trouble(u'ERROR: unable to extract video width')
1634 yv_video_width = mobj.group(1)
1636 # Retrieve video playlist to extract media URL
1637 # I'm not completely sure what all these options are, but we
1638 # seem to need most of them, otherwise the server sends a 401.
1639 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1640 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1641 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1642 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1643 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1645 self.report_download_webpage(video_id)
1646 webpage = urllib2.urlopen(request).read()
1647 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1648 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1651 # Extract media URL from playlist XML
1652 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1654 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1656 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1657 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1660 # Process video information
1661 self._downloader.process_info({
1662 'id': video_id.decode('utf-8'),
1664 'uploader': video_uploader,
1665 'upload_date': u'NA',
1666 'title': video_title,
1667 'stitle': simple_title,
1668 'ext': video_extension.decode('utf-8'),
1669 'thumbnail': video_thumbnail.decode('utf-8'),
1670 'description': video_description,
1671 'thumbnail': video_thumbnail,
1672 'description': video_description,
1675 except UnavailableVideoError:
1676 self._downloader.trouble(u'\nERROR: unable to download video')
1679 class GenericIE(InfoExtractor):
1680 """Generic last-resort information extractor."""
1682 def __init__(self, downloader=None):
1683 InfoExtractor.__init__(self, downloader)
1689 def report_download_webpage(self, video_id):
1690 """Report webpage download."""
1691 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1692 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1694 def report_extraction(self, video_id):
1695 """Report information extraction."""
1696 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1698 def _real_initialize(self):
1701 def _real_extract(self, url):
1702 # At this point we have a new video
1703 self._downloader.increment_downloads()
1705 video_id = url.split('/')[-1]
1706 request = urllib2.Request(url)
1708 self.report_download_webpage(video_id)
1709 webpage = urllib2.urlopen(request).read()
1710 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1711 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1713 except ValueError, err:
1714 # since this is the last-resort InfoExtractor, if
1715 # this error is thrown, it'll be thrown here
1716 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1719 self.report_extraction(video_id)
1720 # Start with something easy: JW Player in SWFObject
1721 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1723 # Broaden the search a little bit
1724 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1726 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1729 # It's possible that one of the regexes
1730 # matched, but returned an empty group:
1731 if mobj.group(1) is None:
1732 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1735 video_url = urllib.unquote(mobj.group(1))
1736 video_id = os.path.basename(video_url)
1738 # here's a fun little line of code for you:
1739 video_extension = os.path.splitext(video_id)[1][1:]
1740 video_id = os.path.splitext(video_id)[0]
1742 # it's tempting to parse this further, but you would
1743 # have to take into account all the variations like
1744 # Video Title - Site Name
1745 # Site Name | Video Title
1746 # Video Title - Tagline | Site Name
1747 # and so on and so forth; it's just not practical
1748 mobj = re.search(r'<title>(.*)</title>', webpage)
1750 self._downloader.trouble(u'ERROR: unable to extract title')
1752 video_title = mobj.group(1).decode('utf-8')
1753 video_title = sanitize_title(video_title)
1754 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1756 # video uploader is domain name
1757 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1759 self._downloader.trouble(u'ERROR: unable to extract title')
1761 video_uploader = mobj.group(1).decode('utf-8')
1764 # Process video information
1765 self._downloader.process_info({
1766 'id': video_id.decode('utf-8'),
1767 'url': video_url.decode('utf-8'),
1768 'uploader': video_uploader,
1769 'upload_date': u'NA',
1770 'title': video_title,
1771 'stitle': simple_title,
1772 'ext': video_extension.decode('utf-8'),
1776 except UnavailableVideoError, err:
1777 self._downloader.trouble(u'\nERROR: unable to download video')
1780 class YoutubeSearchIE(InfoExtractor):
1781 """Information Extractor for YouTube search queries."""
1782 _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1783 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1784 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1785 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1787 _max_youtube_results = 1000
1789 def __init__(self, youtube_ie, downloader=None):
1790 InfoExtractor.__init__(self, downloader)
1791 self._youtube_ie = youtube_ie
1795 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1797 def report_download_page(self, query, pagenum):
1798 """Report attempt to download playlist page with given number."""
1799 query = query.decode(preferredencoding())
1800 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1802 def _real_initialize(self):
1803 self._youtube_ie.initialize()
1805 def _real_extract(self, query):
1806 mobj = re.match(self._VALID_QUERY, query)
1808 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1811 prefix, query = query.split(':')
1813 query = query.encode('utf-8')
1815 self._download_n_results(query, 1)
1817 elif prefix == 'all':
1818 self._download_n_results(query, self._max_youtube_results)
1824 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1826 elif n > self._max_youtube_results:
1827 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1828 n = self._max_youtube_results
1829 self._download_n_results(query, n)
1831 except ValueError: # parsing prefix as integer fails
1832 self._download_n_results(query, 1)
1835 def _download_n_results(self, query, n):
1836 """Downloads a specified number of results for a query"""
1839 already_seen = set()
1843 self.report_download_page(query, pagenum)
1844 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1845 request = urllib2.Request(result_url)
1847 page = urllib2.urlopen(request).read()
1848 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1849 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1852 # Extract video identifiers
1853 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1854 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1855 if video_id not in already_seen:
1856 video_ids.append(video_id)
1857 already_seen.add(video_id)
1858 if len(video_ids) == n:
1859 # Specified n videos reached
1860 for id in video_ids:
1861 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1864 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1865 for id in video_ids:
1866 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1869 pagenum = pagenum + 1
1871 class GoogleSearchIE(InfoExtractor):
1872 """Information Extractor for Google Video search queries."""
1873 _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1874 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1875 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1876 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1878 _max_google_results = 1000
1880 def __init__(self, google_ie, downloader=None):
1881 InfoExtractor.__init__(self, downloader)
1882 self._google_ie = google_ie
1886 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1888 def report_download_page(self, query, pagenum):
1889 """Report attempt to download playlist page with given number."""
1890 query = query.decode(preferredencoding())
1891 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1893 def _real_initialize(self):
1894 self._google_ie.initialize()
1896 def _real_extract(self, query):
1897 mobj = re.match(self._VALID_QUERY, query)
1899 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1902 prefix, query = query.split(':')
1904 query = query.encode('utf-8')
1906 self._download_n_results(query, 1)
1908 elif prefix == 'all':
1909 self._download_n_results(query, self._max_google_results)
1915 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1917 elif n > self._max_google_results:
1918 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1919 n = self._max_google_results
1920 self._download_n_results(query, n)
1922 except ValueError: # parsing prefix as integer fails
1923 self._download_n_results(query, 1)
1926 def _download_n_results(self, query, n):
1927 """Downloads a specified number of results for a query"""
1930 already_seen = set()
1934 self.report_download_page(query, pagenum)
1935 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1936 request = urllib2.Request(result_url)
1938 page = urllib2.urlopen(request).read()
1939 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1940 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1943 # Extract video identifiers
1944 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1945 video_id = mobj.group(1)
1946 if video_id not in already_seen:
1947 video_ids.append(video_id)
1948 already_seen.add(video_id)
1949 if len(video_ids) == n:
1950 # Specified n videos reached
1951 for id in video_ids:
1952 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1955 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1956 for id in video_ids:
1957 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1960 pagenum = pagenum + 1
1962 class YahooSearchIE(InfoExtractor):
1963 """Information Extractor for Yahoo! Video search queries."""
1964 _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
1965 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1966 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1967 _MORE_PAGES_INDICATOR = r'\s*Next'
1969 _max_yahoo_results = 1000
1971 def __init__(self, yahoo_ie, downloader=None):
1972 InfoExtractor.__init__(self, downloader)
1973 self._yahoo_ie = yahoo_ie
1977 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
1979 def report_download_page(self, query, pagenum):
1980 """Report attempt to download playlist page with given number."""
1981 query = query.decode(preferredencoding())
1982 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1984 def _real_initialize(self):
1985 self._yahoo_ie.initialize()
1987 def _real_extract(self, query):
1988 mobj = re.match(self._VALID_QUERY, query)
1990 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1993 prefix, query = query.split(':')
1995 query = query.encode('utf-8')
1997 self._download_n_results(query, 1)
1999 elif prefix == 'all':
2000 self._download_n_results(query, self._max_yahoo_results)
2006 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2008 elif n > self._max_yahoo_results:
2009 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2010 n = self._max_yahoo_results
2011 self._download_n_results(query, n)
2013 except ValueError: # parsing prefix as integer fails
2014 self._download_n_results(query, 1)
2017 def _download_n_results(self, query, n):
2018 """Downloads a specified number of results for a query"""
2021 already_seen = set()
2025 self.report_download_page(query, pagenum)
2026 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2027 request = urllib2.Request(result_url)
2029 page = urllib2.urlopen(request).read()
2030 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2031 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2034 # Extract video identifiers
2035 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2036 video_id = mobj.group(1)
2037 if video_id not in already_seen:
2038 video_ids.append(video_id)
2039 already_seen.add(video_id)
2040 if len(video_ids) == n:
2041 # Specified n videos reached
2042 for id in video_ids:
2043 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2046 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2047 for id in video_ids:
2048 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2051 pagenum = pagenum + 1
2053 class YoutubePlaylistIE(InfoExtractor):
2054 """Information Extractor for YouTube playlists."""
2056 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/|p/)([^&]+).*'
2057 _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
2058 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2059 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2062 def __init__(self, youtube_ie, downloader=None):
2063 InfoExtractor.__init__(self, downloader)
2064 self._youtube_ie = youtube_ie
2068 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
2070 def report_download_page(self, playlist_id, pagenum):
2071 """Report attempt to download playlist page with given number."""
2072 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2074 def _real_initialize(self):
2075 self._youtube_ie.initialize()
2077 def _real_extract(self, url):
2078 # Extract playlist id
2079 mobj = re.match(self._VALID_URL, url)
2081 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2084 # Download playlist pages
2085 playlist_id = mobj.group(1)
2090 self.report_download_page(playlist_id, pagenum)
2091 request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum))
2093 page = urllib2.urlopen(request).read()
2094 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2095 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2098 # Extract video identifiers
2100 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2101 if mobj.group(1) not in ids_in_page:
2102 ids_in_page.append(mobj.group(1))
2103 video_ids.extend(ids_in_page)
2105 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2107 pagenum = pagenum + 1
2109 playliststart = self._downloader.params.get('playliststart', 1) - 1
2110 playlistend = self._downloader.params.get('playlistend', -1)
2111 video_ids = video_ids[playliststart:playlistend]
2113 for id in video_ids:
2114 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2117 class YoutubeUserIE(InfoExtractor):
2118 """Information Extractor for YouTube users."""
2120 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
2121 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2122 _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
2125 def __init__(self, youtube_ie, downloader=None):
2126 InfoExtractor.__init__(self, downloader)
2127 self._youtube_ie = youtube_ie
2131 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2133 def report_download_page(self, username):
2134 """Report attempt to download user page."""
2135 self._downloader.to_screen(u'[youtube] user %s: Downloading page ' % (username))
2137 def _real_initialize(self):
2138 self._youtube_ie.initialize()
2140 def _real_extract(self, url):
2142 mobj = re.match(self._VALID_URL, url)
2144 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2147 # Download user page
2148 username = mobj.group(1)
2152 self.report_download_page(username)
2153 request = urllib2.Request(self._TEMPLATE_URL % (username))
2155 page = urllib2.urlopen(request).read()
2156 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2157 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2160 # Extract video identifiers
2163 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2164 if mobj.group(1) not in ids_in_page:
2165 ids_in_page.append(mobj.group(1))
2166 video_ids.extend(ids_in_page)
2168 playliststart = self._downloader.params.get('playliststart', 1) - 1
2169 playlistend = self._downloader.params.get('playlistend', -1)
2170 video_ids = video_ids[playliststart:playlistend]
2172 for id in video_ids:
2173 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2176 class DepositFilesIE(InfoExtractor):
2177 """Information extractor for depositfiles.com"""
2179 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2181 def __init__(self, downloader=None):
2182 InfoExtractor.__init__(self, downloader)
2186 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2188 def report_download_webpage(self, file_id):
2189 """Report webpage download."""
2190 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2192 def report_extraction(self, file_id):
2193 """Report information extraction."""
2194 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2196 def _real_initialize(self):
2199 def _real_extract(self, url):
2200 # At this point we have a new file
2201 self._downloader.increment_downloads()
2203 file_id = url.split('/')[-1]
2204 # Rebuild url in english locale
2205 url = 'http://depositfiles.com/en/files/' + file_id
2207 # Retrieve file webpage with 'Free download' button pressed
2208 free_download_indication = { 'gateway_result' : '1' }
2209 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2211 self.report_download_webpage(file_id)
2212 webpage = urllib2.urlopen(request).read()
2213 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2214 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2217 # Search for the real file URL
2218 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2219 if (mobj is None) or (mobj.group(1) is None):
2220 # Try to figure out reason of the error.
2221 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2222 if (mobj is not None) and (mobj.group(1) is not None):
2223 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2224 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2226 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2229 file_url = mobj.group(1)
2230 file_extension = os.path.splitext(file_url)[1][1:]
2232 # Search for file title
2233 mobj = re.search(r'<b title="(.*?)">', webpage)
2235 self._downloader.trouble(u'ERROR: unable to extract title')
2237 file_title = mobj.group(1).decode('utf-8')
2240 # Process file information
2241 self._downloader.process_info({
2242 'id': file_id.decode('utf-8'),
2243 'url': file_url.decode('utf-8'),
2245 'upload_date': u'NA',
2246 'title': file_title,
2247 'stitle': file_title,
2248 'ext': file_extension.decode('utf-8'),
2252 except UnavailableVideoError, err:
2253 self._downloader.trouble(u'ERROR: unable to download file')
class PostProcessor(object):
    """Base class for post-download processing steps.

    A PostProcessor is attached to a downloader through the downloader's
    add_post_processor() method. After every successful download, the
    downloader walks its chain of PostProcessors, calling run() on each:
    the first receives an initial argument and each later one receives
    whatever its predecessor returned.

    The chain stops as soon as a PostProcessor returns None, or when the
    end of the chain is reached.

    PostProcessor objects follow a "mutual registration" process similar
    to InfoExtractor objects.
    """

    # Downloader this PP is registered with (set via __init__ or
    # set_downloader); None until registration happens.
    _downloader = None

    def __init__(self, downloader=None):
        self._downloader = downloader

    def set_downloader(self, downloader):
        """Sets the downloader for this PP."""
        self._downloader = downloader

    def run(self, information):
        """Run the PostProcessor.

        The "information" argument is a dictionary like the ones
        composed by InfoExtractors, with one extra field, "filepath",
        pointing at the downloaded file.

        Returning None stops the postprocessing chain; returning an
        information dictionary (possibly the received one, with some
        fields changed) passes it on to the next object in the chain.

        This method may also raise a PostProcessingError exception,
        which the calling downloader takes into account.
        """
        return information # by default, do nothing
2301 ### MAIN PROGRAM ###
2302 if __name__ == '__main__':
# NOTE(review): this chunk is an elided listing with the original file's line
# numbers baked into every line; several structural lines (try:, else:,
# blanks) fall outside the visible excerpt.
2304 # Modules needed only when running the main program
2308 # Function to update the program file with the latest version from the repository.
2309 def update_self(downloader, filename):
2310 # Note: downloader only used for options
# Self-update requires write access to our own script file.
2311 if not os.access(filename, os.W_OK):
2312 sys.exit('ERROR: no write permissions on %s' % filename)
2314 downloader.to_screen('Updating to latest stable version...')
# Fetch the latest release tag, then the matching script body, over plain
# HTTP. NOTE(review): no integrity check on the downloaded code; the try:
# matching the except on 2320 is not visible in this excerpt.
2316 latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
2317 latest_version = urllib.urlopen(latest_url).read().strip()
2318 prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
2319 newcontent = urllib.urlopen(prog_url).read()
2320 except (IOError, OSError), err:
2321 sys.exit('ERROR: unable to download latest version')
# Overwrite this script in place with the freshly downloaded content.
2323 stream = open(filename, 'w')
2324 stream.write(newcontent)
2326 except (IOError, OSError), err:
2327 sys.exit('ERROR: unable to overwrite current version')
2328 downloader.to_screen('Updated to version %s' % latest_version)
2330 # Parse command line
2331 parser = optparse.OptionParser(
2332 usage='Usage: %prog [options] url...',
2333 version='2010.12.09',
2334 conflict_handler='resolve',
# General options: help/version, self-update, error tolerance, rate limit,
# retry count, playlist range, user-agent dump.
2337 parser.add_option('-h', '--help',
2338 action='help', help='print this help text and exit')
2339 parser.add_option('-v', '--version',
2340 action='version', help='print program version and exit')
2341 parser.add_option('-U', '--update',
2342 action='store_true', dest='update_self', help='update this program to latest stable version')
2343 parser.add_option('-i', '--ignore-errors',
2344 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
2345 parser.add_option('-r', '--rate-limit',
2346 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
2347 parser.add_option('-R', '--retries',
2348 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
2349 parser.add_option('--playlist-start',
2350 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
2351 parser.add_option('--playlist-end',
2352 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
2353 parser.add_option('--dump-user-agent',
2354 action='store_true', dest='dump_user_agent', help='display the current browser identification', default=False)
# Login credential options.
2356 authentication = optparse.OptionGroup(parser, 'Authentication Options')
2357 authentication.add_option('-u', '--username',
2358 dest='username', metavar='USERNAME', help='account username')
2359 authentication.add_option('-p', '--password',
2360 dest='password', metavar='PASSWORD', help='account password')
2361 authentication.add_option('-n', '--netrc',
2362 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
2363 parser.add_option_group(authentication)
# Video format selection options.
2365 video_format = optparse.OptionGroup(parser, 'Video Format Options')
2366 video_format.add_option('-f', '--format',
2367 action='store', dest='format', metavar='FORMAT', help='video format code')
2368 video_format.add_option('--all-formats',
2369 action='store_const', dest='format', help='download all available video formats', const='-1')
2370 video_format.add_option('--max-quality',
2371 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
2372 parser.add_option_group(video_format)
# Output verbosity and simulation (no actual download) options.
2374 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
2375 verbosity.add_option('-q', '--quiet',
2376 action='store_true', dest='quiet', help='activates quiet mode', default=False)
2377 verbosity.add_option('-s', '--simulate',
2378 action='store_true', dest='simulate', help='do not download video', default=False)
2379 verbosity.add_option('-g', '--get-url',
2380 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
2381 verbosity.add_option('-e', '--get-title',
2382 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
2383 verbosity.add_option('--get-thumbnail',
2384 action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
2385 verbosity.add_option('--get-description',
2386 action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
2387 verbosity.add_option('--no-progress',
2388 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
2389 verbosity.add_option('--console-title',
2390 action='store_true', dest='consoletitle', help='display progress in console titlebar', default=False)
2391 parser.add_option_group(verbosity)
# Filename templating and filesystem behaviour options.
2393 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
2394 filesystem.add_option('-t', '--title',
2395 action='store_true', dest='usetitle', help='use title in file name', default=False)
2396 filesystem.add_option('-l', '--literal',
2397 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
2398 filesystem.add_option('-A', '--auto-number',
2399 action='store_true', dest='autonumber', help='number downloaded files starting from 00000', default=False)
2400 filesystem.add_option('-o', '--output',
2401 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
2402 filesystem.add_option('-a', '--batch-file',
2403 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
2404 filesystem.add_option('-w', '--no-overwrites',
2405 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
2406 filesystem.add_option('-c', '--continue',
2407 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
2408 filesystem.add_option('--cookies',
2409 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
2410 filesystem.add_option('--no-part',
2411 action='store_true', dest='nopart', help='do not use .part files', default=False)
2412 parser.add_option_group(filesystem)
2414 (opts, args) = parser.parse_args()
2416 # Open appropriate CookieJar
# In-memory jar when no --cookies file is given; otherwise a Mozilla-format
# jar bound to that file, loading it only if present and readable.
2417 if opts.cookiefile is None:
2418 jar = cookielib.CookieJar()
2421 jar = cookielib.MozillaCookieJar(opts.cookiefile)
2422 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
2424 except (IOError, OSError), err:
2425 sys.exit(u'ERROR: unable to open cookie file')
# --dump-user-agent: print the UA string from std_headers and (presumably)
# exit — the following line is outside the visible excerpt.
2428 if opts.dump_user_agent:
2429 print std_headers['User-Agent']
2432 # General configuration
# Install a global urllib2 opener with proxy support, the cookie jar above,
# and the project's YoutubeDLHandler.
2433 cookie_processor = urllib2.HTTPCookieProcessor(jar)
2434 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()))
2435 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
2437 # Batch file verification
2439 if opts.batchfile is not None:
2441 if opts.batchfile == '-':
2444 batchfd = open(opts.batchfile, 'r')
2445 batchurls = batchfd.readlines()
2446 batchurls = [x.strip() for x in batchurls]
# Keep only non-empty lines that do not start with '#', '/' or ';'.
2447 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
2449 sys.exit(u'ERROR: batch file could not be read')
2450 all_urls = batchurls + args
2452 # Conflicting, missing and erroneous options
2453 if opts.usenetrc and (opts.username is not None or opts.password is not None):
2454 parser.error(u'using .netrc conflicts with giving username/password')
2455 if opts.password is not None and opts.username is None:
2456 parser.error(u'account username missing')
2457 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
2458 parser.error(u'using output template conflicts with using title, literal title or auto number')
2459 if opts.usetitle and opts.useliteral:
2460 parser.error(u'using title conflicts with using literal title')
# Prompt interactively when a username was given without a password.
2461 if opts.username is not None and opts.password is None:
2462 opts.password = getpass.getpass(u'Type account password and press return:')
# Normalize string-valued numeric options (rate limit, retries, playlist
# range) into numbers, rejecting invalid values via parser.error().
2463 if opts.ratelimit is not None:
2464 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
2465 if numeric_limit is None:
2466 parser.error(u'invalid rate limit specified')
2467 opts.ratelimit = numeric_limit
2468 if opts.retries is not None:
2470 opts.retries = long(opts.retries)
2471 except (TypeError, ValueError), err:
2472 parser.error(u'invalid retry count specified')
2474 opts.playliststart = long(opts.playliststart)
2475 if opts.playliststart <= 0:
2477 except (TypeError, ValueError), err:
2478 parser.error(u'invalid playlist start number specified')
2480 opts.playlistend = long(opts.playlistend)
2481 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
2483 except (TypeError, ValueError), err:
2484 parser.error(u'invalid playlist end number specified')
2486 # Information extractors
# The search/playlist/user IEs wrap the site IE they delegate to.
2487 youtube_ie = YoutubeIE()
2488 metacafe_ie = MetacafeIE(youtube_ie)
2489 dailymotion_ie = DailymotionIE()
2490 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
2491 youtube_user_ie = YoutubeUserIE(youtube_ie)
2492 youtube_search_ie = YoutubeSearchIE(youtube_ie)
2493 google_ie = GoogleIE()
2494 google_search_ie = GoogleSearchIE(google_ie)
2495 photobucket_ie = PhotobucketIE()
2496 yahoo_ie = YahooIE()
2497 yahoo_search_ie = YahooSearchIE(yahoo_ie)
2498 deposit_files_ie = DepositFilesIE()
2499 generic_ie = GenericIE()
# Build the downloader from the parsed options. Any --get-* option implies
# both quiet and simulate mode. The 'outtmpl' chain picks the first template
# matching the selected naming options, falling back to '%(id)s.%(ext)s'.
2502 fd = FileDownloader({
2503 'usenetrc': opts.usenetrc,
2504 'username': opts.username,
2505 'password': opts.password,
2506 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
2507 'forceurl': opts.geturl,
2508 'forcetitle': opts.gettitle,
2509 'forcethumbnail': opts.getthumbnail,
2510 'forcedescription': opts.getdescription,
2511 'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
2512 'format': opts.format,
2513 'format_limit': opts.format_limit,
2514 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
2515 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
2516 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
2517 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
2518 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
2519 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
2520 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
2521 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
2522 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
2523 or u'%(id)s.%(ext)s'),
2524 'ignoreerrors': opts.ignoreerrors,
2525 'ratelimit': opts.ratelimit,
2526 'nooverwrites': opts.nooverwrites,
2527 'retries': opts.retries,
2528 'continuedl': opts.continue_dl,
2529 'noprogress': opts.noprogress,
2530 'playliststart': opts.playliststart,
2531 'playlistend': opts.playlistend,
2532 'logtostderr': opts.outtmpl == '-',
2533 'consoletitle': opts.consoletitle,
2534 'nopart': opts.nopart,
# Registration order matters: more specific IEs (search, playlist, user)
# come before the site IEs they specialize.
2536 fd.add_info_extractor(youtube_search_ie)
2537 fd.add_info_extractor(youtube_pl_ie)
2538 fd.add_info_extractor(youtube_user_ie)
2539 fd.add_info_extractor(metacafe_ie)
2540 fd.add_info_extractor(dailymotion_ie)
2541 fd.add_info_extractor(youtube_ie)
2542 fd.add_info_extractor(google_ie)
2543 fd.add_info_extractor(google_search_ie)
2544 fd.add_info_extractor(photobucket_ie)
2545 fd.add_info_extractor(yahoo_ie)
2546 fd.add_info_extractor(yahoo_search_ie)
2547 fd.add_info_extractor(deposit_files_ie)
2549 # This must come last since it's the
2550 # fallback if none of the others work
2551 fd.add_info_extractor(generic_ie)
# -U/--update: replace this script with the latest published version.
2554 if opts.update_self:
2555 update_self(fd, sys.argv[0])
# With no URLs, error out unless the run was only a self-update.
2558 if len(all_urls) < 1:
2559 if not opts.update_self:
2560 parser.error(u'you must provide at least one URL')
2563 retcode = fd.download(all_urls)
2565 # Dump cookie jar if requested
2566 if opts.cookiefile is not None:
2569 except (IOError, OSError), err:
2570 sys.exit(u'ERROR: unable to save cookie jar')
# Top-level exception handlers; the matching try: around the main body is
# outside the visible excerpt.
2574 except DownloadError:
2576 except SameFileError:
2577 sys.exit(u'ERROR: fixed output name but more than one file to download')
2578 except KeyboardInterrupt:
2579 sys.exit(u'\nERROR: Interrupted by user')