2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # Author: Vasyl' Vavrychuk
7 # License: Public domain code
30 # parse_qs was moved from the cgi module to the urlparse module recently.
32 from urlparse import parse_qs
34 from cgi import parse_qs
37 'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.12) Gecko/20101028 Firefox/3.6.12',
38 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
39 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
40 'Accept-Encoding': 'gzip, deflate',
41 'Accept-Language': 'en-us,en;q=0.5',
# Characters allowed in "simplified" titles: ASCII letters and digits,
# decoded to unicode (Python 2 str.decode).
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    # Inner generator so the locale query runs lazily, once.
    def yield_preferredencoding():
        pref = locale.getpreferredencoding()
        # NOTE(review): the yield statement(s) and the validation/fallback
        # logic are missing from this excerpt.
    # Take the first (and only) value the generator produces (Python 2 .next()).
    return yield_preferredencoding().next()
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a Unicode character.

    This function receives a match object and is intended to be used with
    the re.sub() function.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in htmlentitydefs.name2codepoint:
        return unichr(htmlentitydefs.name2codepoint[entity])

    # Numeric character reference, e.g. "#64" or hexadecimal "#x40".
    mobj = re.match(ur'(?u)#(x?\d+)', entity)
    # NOTE(review): the None-check on mobj and the assignment choosing base
    # (10 vs 16) are missing from this excerpt.
    numstr = mobj.group(1)
    if numstr.startswith(u'x'):
        # Prefix with '0' so the string becomes a valid '0x...' literal.
        numstr = u'0%s' % numstr
    return unichr(long(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
88 def sanitize_title(utitle):
89 """Sanitizes a video title so it could be used as part of a filename."""
90 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
91 return utitle.replace(unicode(os.sep), u'%')
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    # NOTE(review): the try: line, the guard selecting the stdout ('-')
    # special case, and the msvcrt import are missing from this excerpt.
    if sys.platform == 'win32':
        # Put stdout into binary mode on Windows so video bytes are not mangled.
        msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
    return (sys.stdout, filename)
    stream = open(filename, open_mode)
    return (stream, filename)
    except (IOError, OSError), err:
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(filename, open_mode)
        return (stream, filename)
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        self.downloaded = downloaded   # bytes actually received
        self.expected = expected       # bytes announced by the server
class YoutubeDLHandler(urllib2.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    # NOTE(review): the 'deflate' staticmethod header and its try/except are
    # missing from this excerpt; negative wbits requests a raw deflate
    # stream, the second call handles zlib-wrapped data.
    return zlib.decompress(data, -zlib.MAX_WBITS)
    return zlib.decompress(data)

    def addinfourl_wrapper(stream, headers, url, code):
        # Newer Python 2.x addinfourl accepts the HTTP status in the
        # constructor (and exposes getcode()); detect that at runtime.
        if hasattr(urllib2.addinfourl, 'getcode'):
            return urllib2.addinfourl(stream, headers, url, code)
        ret = urllib2.addinfourl(stream, headers, url)
        # NOTE(review): the lines assigning ret.code and returning ret are
        # missing from this excerpt.

    def http_request(self, req):
        # Add every default header; the original has_header guard (avoid
        # clobbering caller-supplied headers) is not visible in this excerpt.
        for h in std_headers:
            req.add_header(h, std_headers[h])
        # Internal marker header: strip Accept-encoding so the server sends
        # an uncompressed body, then remove the marker before the real request.
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        # NOTE(review): the return statement is missing from this excerpt.

    def http_response(self, req, resp):
        # NOTE(review): the 'old_resp = resp' assignment is missing from
        # this excerpt.
        # gzip-encoded body: re-wrap the payload in a transparent reader.
        if resp.headers.get('Content-encoding', '') == 'gzip':
            gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate-encoded body: inflate, then re-wrap the same way.
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = StringIO.StringIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
class FileDownloader(object):
    """File Downloader class.

    File downloader objects are the ones responsible of downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. As, given a video URL, the downloader doesn't know how to
    extract all the needed information, task that InfoExtractors do, it
    has to pass the URL to one of them.

    For this, file downloader objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the file downloader handles it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    asks the FileDownloader to process the video information, possibly
    downloading the video.

    File downloaders accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The FileDownloader also
    registers itself as the downloader in charge for the InfoExtractors
    that are added to it, so this is a "mutual registration".

    Available options:

    username:         Username for authentication purposes.
    password:         Password for authentication purposes.
    usenetrc:         Use netrc for authentication instead.
    quiet:            Do not print messages to stdout.
    forceurl:         Force printing final URL.
    forcetitle:       Force printing title.
    forcethumbnail:   Force printing thumbnail URL.
    forcedescription: Force printing description.
    simulate:         Do not download the video files.
    format:           Video format code.
    format_limit:     Highest quality format to try.
    outtmpl:          Template for output names.
    ignoreerrors:     Do not stop on download errors.
    ratelimit:        Download speed limit, in bytes/sec.
    nooverwrites:     Prevent overwriting files.
    retries:          Number of times to retry for HTTP error 5xx
    continuedl:       Try to continue downloads if possible.
    noprogress:       Do not print the progress bar.
    playliststart:    Playlist item to start at.
    playlistend:      Playlist item to end at.
    logtostderr:      Log messages to stderr instead of stdout.
    consoletitle:     Display progress in console window's titlebar.
    nopart:           Do not use temporary .part files.
    """

    # Class-level defaults; real values are set per instance in __init__.
    _download_retcode = None   # process exit code (0 ok, 1 after errors)
    _num_downloads = None      # ordinal used by the %(autonumber)s template

    def __init__(self, params):
        """Create a FileDownloader object with the given options."""
        # NOTE(review): the lines initialising the InfoExtractor and
        # PostProcessor lists and storing params are missing from this excerpt.
        self._download_retcode = 0
        self._num_downloads = 0
        # Route screen output to stderr when 'logtostderr' is set.
        self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
    def pmkdir(filename):
        """Create directory components in filename. Similar to Unix "mkdir -p"."""
        # NOTE(review): the @staticmethod decorator and the os.mkdir call in
        # the loop body are missing from this excerpt.
        components = filename.split(os.sep)
        # Build every ancestor path, shortest first.
        aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
        aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
        for dir in aggregate:
            if not os.path.exists(dir):
    def format_bytes(bytes):
        """Format a byte count as a short human-readable string, e.g. '1.21M'."""
        # NOTE(review): the decorator and the branches handling None, zero
        # and string input are missing from this excerpt.
        if type(bytes) is str:
        # Pick the largest power of 1024 not exceeding the value.
        exponent = long(math.log(bytes, 1024.0))
        suffix = 'bkMGTPEZY'[exponent]
        converted = float(bytes) / float(1024**exponent)
        return '%.2f%s' % (converted, suffix)
    def calc_percent(byte_counter, data_len):
        """Render download progress as a right-aligned percentage string."""
        # NOTE(review): the decorator and the fallback for unknown data_len
        # are missing from this excerpt.
        return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
    def calc_eta(start, now, total, current):
        """Estimate time remaining for the download as an 'MM:SS' string."""
        # NOTE(review): the decorator, the 'dif = now - start' assignment and
        # the '--:--' fallback returns are missing from this excerpt.
        if current == 0 or dif < 0.001: # One millisecond
        rate = float(current) / dif
        eta = long((float(total) - float(current)) / rate)
        (eta_mins, eta_secs) = divmod(eta, 60)
        return '%02d:%02d' % (eta_mins, eta_secs)
    def calc_speed(start, now, bytes):
        """Render the average transfer speed since 'start' as a string."""
        # NOTE(review): the decorator and the 'dif = now - start' assignment
        # are missing from this excerpt.
        if bytes == 0 or dif < 0.001: # One millisecond
            return '%10s' % '---b/s'
        return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
    def best_block_size(elapsed_time, bytes):
        """Choose the next read size from the previous block's throughput."""
        # Clamp the next block between half and double the previous one.
        new_min = max(bytes / 2.0, 1.0)
        new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
        if elapsed_time < 0.001:
        rate = bytes / elapsed_time
        # NOTE(review): the comparisons against new_min/new_max and the
        # return statements are missing from this excerpt.
    def parse_bytes(bytestr):
        """Parse a string indicating a byte quantity into a long integer."""
        # NOTE(review): the decorator and the None-check on matchobj are
        # missing from this excerpt.
        matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
        number = float(matchobj.group(1))
        # The suffix letter's position selects the power of 1024; an empty
        # suffix yields index 0, i.e. multiplier 1.
        multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
        return long(round(number * multiplier))
    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        # NOTE(review): the line appending ie to the internal list is missing
        # from this excerpt; only the mutual registration remains visible.
        ie.set_downloader(self)
    def add_post_processor(self, pp):
        """Add a PostProcessor object to the end of the chain."""
        # NOTE(review): the line appending pp to the internal chain is
        # missing from this excerpt; only the mutual registration remains.
        pp.set_downloader(self)
    def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
        """Print message to stdout if not in quiet mode."""
        # NOTE(review): the try: line is missing from this excerpt.
        if not self.params.get('quiet', False):
            # skip_eol suppresses the newline so progress lines can be
            # rewritten in place.
            terminator = [u'\n', u''][skip_eol]
            print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
            self._screen_file.flush()
        except (UnicodeEncodeError), err:
            if not ignore_encoding_errors:
                # NOTE(review): the re-raise is missing from this excerpt.
388 def to_stderr(self, message):
389 """Print message to stderr."""
390 print >>sys.stderr, message.encode(preferredencoding())
    def to_cons_title(self, message):
        """Set console/terminal window title to message."""
        if not self.params.get('consoletitle', False):
            # NOTE(review): the early return is missing from this excerpt.
        if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
            # c_wchar_p() might not be necessary if `message` is
            # already of type unicode()
            ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
        elif 'TERM' in os.environ:
            # xterm-style OSC escape sequence that sets the window title.
            sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
403 def fixed_template(self):
404 """Checks if the output template is fixed."""
405 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
    def trouble(self, message=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.
        """
        if message is not None:
            self.to_stderr(message)
        if not self.params.get('ignoreerrors', False):
            raise DownloadError(message)
        # Errors are being ignored: record the failure in the exit code.
        self._download_retcode = 1
    def slow_down(self, start_time, byte_counter):
        """Sleep if the download speed is over the rate limit."""
        rate_limit = self.params.get('ratelimit', None)
        if rate_limit is None or byte_counter == 0:
            # NOTE(review): the early return and the 'now = time.time()'
            # assignment are missing from this excerpt.
        elapsed = now - start_time
        # NOTE(review): the guard for a non-positive elapsed time is missing
        # from this excerpt.
        speed = float(byte_counter) / elapsed
        if speed > rate_limit:
            # Sleep just long enough for the running average to drop back to
            # the configured limit.
            time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
    def temp_name(self, filename):
        """Returns a temporary filename for the given filename."""
        # '-' (stdout), nopart mode and existing non-regular files are used
        # as-is; NOTE(review): that 'return filename' branch is missing from
        # this excerpt.
        if self.params.get('nopart', False) or filename == u'-' or \
                (os.path.exists(filename) and not os.path.isfile(filename)):
        return filename + u'.part'
    def undo_temp_name(self, filename):
        """Strip the '.part' suffix added by temp_name, when present."""
        if filename.endswith(u'.part'):
            return filename[:-len(u'.part')]
        # NOTE(review): the fallthrough 'return filename' is missing from
        # this excerpt.
    def try_rename(self, old_filename, new_filename):
        # Move the temporary file onto its final name; failures are reported
        # through trouble() rather than raised.
        # NOTE(review): the docstring body, the try: line and the no-op early
        # return are missing from this excerpt.
        if old_filename == new_filename:
        os.rename(old_filename, new_filename)
        except (IOError, OSError), err:
            self.trouble(u'ERROR: unable to rename file')
453 def report_destination(self, filename):
454 """Report destination filename."""
455 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
    def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
        """Report download progress."""
        if self.params.get('noprogress', False):
            # NOTE(review): the early return is missing from this excerpt.
        # Leading '\r' rewrites the current console line in place.
        self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
        self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
                (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
466 def report_resuming_byte(self, resume_len):
467 """Report attempt to resume at given byte."""
468 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
470 def report_retry(self, count, retries):
471 """Report retry in case of HTTP error 5xx"""
472 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
    def report_file_already_downloaded(self, file_name):
        """Report file has already been fully downloaded."""
        # NOTE(review): the try: line is missing from this excerpt.
        self.to_screen(u'[download] %s has already been downloaded' % file_name)
        except (UnicodeEncodeError), err:
            # The filename itself cannot be printed on this console; fall
            # back to a message without it.
            self.to_screen(u'[download] The file has already been downloaded')
481 def report_unable_to_resume(self):
482 """Report it was impossible to resume download."""
483 self.to_screen(u'[download] Unable to resume')
    def report_finish(self):
        """Report download finished."""
        # With the progress bar disabled a plain completion line is printed
        # instead. NOTE(review): the else branch is missing from this excerpt.
        if self.params.get('noprogress', False):
            self.to_screen(u'[download] Download completed')
492 def increment_downloads(self):
493 """Increment the ordinal that assigns a number to each file."""
494 self._num_downloads += 1
    def process_info(self, info_dict):
        """Process a single dictionary returned by an InfoExtractor."""
        # Do nothing else if in simulate mode
        # NOTE(review): the forced-printing block below originally ran inside
        # simulate mode and ended with an early return (not visible here).
        if self.params.get('simulate', False):
        if self.params.get('forcetitle', False):
            print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
        if self.params.get('forceurl', False):
            print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
        if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
            print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
        if self.params.get('forcedescription', False) and 'description' in info_dict:
            print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')

        # Build the output filename from the template.
        # NOTE(review): the try: line is missing from this excerpt.
        template_dict = dict(info_dict)
        template_dict['epoch'] = unicode(long(time.time()))
        # Zero-padded per-run ordinal for the %(autonumber)s template field.
        template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
        filename = self.params['outtmpl'] % template_dict
        except (ValueError, KeyError), err:
            self.trouble(u'ERROR: invalid system charset or erroneous output template')
        # Honour --no-overwrites; NOTE(review): the return after the warning
        # is missing from this excerpt.
        if self.params.get('nooverwrites', False) and os.path.exists(filename):
            self.to_stderr(u'WARNING: file exists and will be skipped')

        # Create any missing directories for the target path.
        # NOTE(review): the try: line is missing from this excerpt.
        self.pmkdir(filename)
        except (OSError, IOError), err:
            self.trouble(u'ERROR: unable to create directories: %s' % str(err))

        # Download the actual video data.
        # NOTE(review): the try: line is missing from this excerpt.
        success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
        except (OSError, IOError), err:
            raise UnavailableVideoError
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self.trouble(u'ERROR: unable to download video data: %s' % str(err))
        except (ContentTooShortError, ), err:
            self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))

        # Run the postprocessing chain on the downloaded file.
        # NOTE(review): the 'if success:' / try: lines are missing from this
        # excerpt.
        self.post_process(filename, info_dict)
        except (PostProcessingError), err:
            self.trouble(u'ERROR: postprocessing: %s' % str(err))
    def download(self, url_list):
        """Download a given list of URLs."""
        # A fixed (non-templated) output name cannot hold more than one file.
        if len(url_list) > 1 and self.fixed_template():
            raise SameFileError(self.params['outtmpl'])

        # NOTE(review): the 'for url in url_list:' and 'for ie in ...:' loop
        # headers are missing from this excerpt.
        suitable_found = False
        # Go to next InfoExtractor if not suitable
        if not ie.suitable(url):
            # NOTE(review): the 'continue' is missing from this excerpt.

        # Suitable InfoExtractor found
        suitable_found = True

        # Extract information from URL and process it
        # NOTE(review): the ie.extract(url) call and the break are missing
        # from this excerpt.

        # Suitable InfoExtractor had been found; go to next URL
        if not suitable_found:
            self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)

        return self._download_retcode
    def post_process(self, filename, ie_info):
        """Run the postprocessing chain on the given file."""
        # NOTE(review): the copy of ie_info into 'info' and the loop feeding
        # each registered PostProcessor are missing from this excerpt.
        info['filepath'] = filename
    def _download_with_rtmpdump(self, filename, url, player_url):
        """Download an rtmp:// URL by shelling out to the rtmpdump binary."""
        self.report_destination(filename)
        tmpfilename = self.temp_name(filename)

        # Check for rtmpdump first
        # NOTE(review): the try: line is missing from this excerpt.
        subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
        except (OSError, IOError):
            self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
            # NOTE(review): the failure return is missing from this excerpt.

        # Download using rtmpdump. rtmpdump returns exit code 2 when
        # the connection was interrumpted and resuming appears to be
        # possible. This is part of rtmpdump's normal usage, AFAIK.
        basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
        retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
        while retval == 2 or retval == 1:
            prevsize = os.path.getsize(tmpfilename)
            self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
            time.sleep(5.0) # This seems to be needed
            retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
            cursize = os.path.getsize(tmpfilename)
            if prevsize == cursize and retval == 1:
                # NOTE(review): the break for a stalled resume is missing
                # from this excerpt.
        # NOTE(review): the 'if retval == 0:' success guard is missing from
        # this excerpt.
        self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
        self.try_rename(tmpfilename, filename)
        # NOTE(review): the 'return True' and the else branch are missing
        # from this excerpt.
        self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
    def _do_download(self, filename, url, player_url):
        """Download url into filename, resuming and retrying as configured."""
        # Check file already present
        if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
            self.report_file_already_downloaded(filename)
            # NOTE(review): the 'return True' is missing from this excerpt.

        # Attempt to download using rtmpdump
        if url.startswith('rtmp'):
            return self._download_with_rtmpdump(filename, url, player_url)

        tmpfilename = self.temp_name(filename)
        # NOTE(review): stream/open_mode initialisation is missing from this
        # excerpt.

        # Do not include the Accept-Encoding header
        headers = {'Youtubedl-no-compression': 'True'}
        # basic_request stays without the Range header; it is reused for the
        # fallback request made on HTTP 416 below.
        basic_request = urllib2.Request(url, None, headers)
        request = urllib2.Request(url, None, headers)

        # Establish possible resume length
        if os.path.isfile(tmpfilename):
            resume_len = os.path.getsize(tmpfilename)
            # NOTE(review): the else branch setting resume_len to 0 is
            # missing from this excerpt.

        # Request parameters in case of being able to resume
        if self.params.get('continuedl', False) and resume_len != 0:
            self.report_resuming_byte(resume_len)
            request.add_header('Range','bytes=%d-' % resume_len)
            # NOTE(review): the append open_mode assignment is presumably
            # nearby — not visible in this excerpt.

        # NOTE(review): the initialisation of 'count' is missing from this
        # excerpt.
        retries = self.params.get('retries', 0)
        while count <= retries:
            # Establish connection
            # NOTE(review): the try: line and the success break are missing
            # from this excerpt.
            data = urllib2.urlopen(request)
            except (urllib2.HTTPError, ), err:
                if (err.code < 500 or err.code >= 600) and err.code != 416:
                    # Unexpected HTTP error
                    # NOTE(review): the raise is missing from this excerpt.
                elif err.code == 416:
                    # Unable to resume (requested range not satisfiable)
                    # Open the connection again without the range header
                    data = urllib2.urlopen(basic_request)
                    content_length = data.info()['Content-Length']
                    except (urllib2.HTTPError, ), err:
                        if err.code < 500 or err.code >= 600:
                            # NOTE(review): the raise is missing from this
                            # excerpt.
                    # Examine the reported length
                    if (content_length is not None and
                        (resume_len - 100 < long(content_length) < resume_len + 100)):
                        # The file had already been fully downloaded.
                        # Explanation to the above condition: in issue #175 it was revealed that
                        # YouTube sometimes adds or removes a few bytes from the end of the file,
                        # changing the file size slightly and causing problems for some users. So
                        # I decided to implement a suggested change and consider the file
                        # completely downloaded if the file size differs less than 100 bytes from
                        # the one in the hard drive.
                        self.report_file_already_downloaded(filename)
                        self.try_rename(tmpfilename, filename)
                        # NOTE(review): the 'return True' is missing from
                        # this excerpt.
                    # The length does not match, we start the download over
                    self.report_unable_to_resume()
                    # NOTE(review): the reset of resume_len/open_mode is
                    # missing from this excerpt.
            self.report_retry(count, retries)
        # NOTE(review): the increment of 'count' is missing from this excerpt.
        self.trouble(u'ERROR: giving up after %s retries' % retries)
        # NOTE(review): the failure return is missing from this excerpt.

        data_len = data.info().get('Content-length', None)
        if data_len is not None:
            # Total size includes the bytes already on disk.
            data_len = long(data_len) + resume_len
        data_len_str = self.format_bytes(data_len)
        byte_counter = 0 + resume_len
        # NOTE(review): block_size/start initialisation and the read-loop
        # header are missing from this excerpt.
        data_block = data.read(block_size)
        # NOTE(review): the 'after = time.time()' timestamp is missing.
        if len(data_block) == 0:
            # NOTE(review): the break ending the download loop is missing.
        byte_counter += len(data_block)

        # Open file just in time
        # NOTE(review): the 'if stream is None:' / try: guards are missing
        # from this excerpt.
        (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
        filename = self.undo_temp_name(tmpfilename)
        self.report_destination(filename)
        except (OSError, IOError), err:
            self.trouble(u'ERROR: unable to open for writing: %s' % str(err))

        stream.write(data_block)
        except (IOError, OSError), err:
            self.trouble(u'\nERROR: unable to write data: %s' % str(err))
        # Adapt the next read size to the measured throughput.
        block_size = self.best_block_size(after - before, len(data_block))

        # Progress message (all figures are relative to the resume point).
        percent_str = self.calc_percent(byte_counter, data_len)
        eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
        speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
        self.report_progress(percent_str, data_len_str, speed_str, eta_str)

        # Apply rate limit
        self.slow_down(start, byte_counter - resume_len)

        # NOTE(review): stream close and report_finish are missing from this
        # excerpt.
        if data_len is not None and byte_counter != data_len:
            raise ContentTooShortError(byte_counter, long(data_len))
        self.try_rename(tmpfilename, filename)
        # NOTE(review): the final 'return True' is missing from this excerpt.
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. The information is stored in a dictionary
    which is then passed to the FileDownloader. The FileDownloader
    processes this information possibly downloading the video to the file
    system, among other possible outcomes. The dictionaries must include
    the following fields:

    id:         Video identifier.
    url:        Final video URL.
    uploader:   Nickname of the video uploader.
    title:      Literal title.
    stitle:     Simplified title.
    ext:        Video filename extension.
    format:     Video format.
    player_url: SWF Player URL (may be None).

    The following fields are optional. Their primary purpose is to allow
    youtube-dl to serve as the backend for a video search function, such
    as the one in youtube2mp3. They are only used when their respective
    forced printing functions are called:

    thumbnail:  Full URL to a video thumbnail image.
    description: One-line video description.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods, as well as the suitable() static method.
    Probably, they should also be instantiated and added to the main
    downloader.
    """

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # NOTE(review): the line initialising the readiness flag is missing
        # from this excerpt.
        self.set_downloader(downloader)

    # NOTE(review): the '@staticmethod def suitable(url):' header belonging
    # to this docstring is missing from this excerpt.
        """Receives a URL and returns True if suitable for this IE."""

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # NOTE(review): the guard that runs this only once is missing from
        # this excerpt.
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # NOTE(review): the initialize() call is missing from this excerpt.
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Matches watch pages, youtu.be short links and /v/ embeds; group 2
    # captures the video id (see _real_extract).
    _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:v/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
    # hl=en / gl=US parameters pin the site to English so scraping is stable.
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'   # machine key used for .netrc lookups
    # Listed in order of quality
    _available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
    # Maps format code -> filename extension; the dict is truncated in this
    # excerpt.
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever

    # NOTE(review): the '@staticmethod def suitable(url):' header is missing
    # from this excerpt.
        return (re.match(YoutubeIE._VALID_URL, url) is not None)
838 def report_lang(self):
839 """Report attempt to set language."""
840 self._downloader.to_screen(u'[youtube] Setting language')
842 def report_login(self):
843 """Report attempt to log in."""
844 self._downloader.to_screen(u'[youtube] Logging in')
846 def report_age_confirmation(self):
847 """Report attempt to confirm age."""
848 self._downloader.to_screen(u'[youtube] Confirming age')
850 def report_video_webpage_download(self, video_id):
851 """Report attempt to download video webpage."""
852 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
854 def report_video_info_webpage_download(self, video_id):
855 """Report attempt to download video info webpage."""
856 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
858 def report_information_extraction(self, video_id):
859 """Report attempt to extract video information."""
860 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
862 def report_unavailable_format(self, video_id, format):
863 """Report extracted video URL."""
864 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
866 def report_rtmp_download(self):
867 """Indicate the download will use the RTMP protocol."""
868 self._downloader.to_screen(u'[youtube] RTMP download detected')
    def _real_initialize(self):
        """Set language, log in and confirm age before extraction begins."""
        if self._downloader is None:
            # NOTE(review): the early return is missing from this excerpt.

        # NOTE(review): username/password default initialisation is missing
        # from this excerpt.
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # NOTE(review): the try: line is missing from this excerpt.
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            # NOTE(review): the branch unpacking info into username/password
            # is missing from this excerpt.
            raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                # NOTE(review): the return is missing from this excerpt.

        # Set language (best effort; failure only produces a warning).
        request = urllib2.Request(self._LANG_URL)
        # NOTE(review): the try:/report_lang() lines are missing from this
        # excerpt.
        urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
            # NOTE(review): the return is missing from this excerpt.

        # No authentication to be performed
        # NOTE(review): the guard skipping login without credentials and the
        # opening of the login_form dict are missing from this excerpt.
        'current_form': 'loginForm',
        'action_login': 'Log In',
        'username': username,
        'password': password,
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        # NOTE(review): the try:/report_login() lines are missing from this
        # excerpt.
        login_results = urllib2.urlopen(request).read()
        if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
            # The login form coming back in the response means the
            # credentials were rejected.
            self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
            # NOTE(review): the return is missing from this excerpt.
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
            # NOTE(review): the return is missing from this excerpt.

        # Confirm age; note this failure is fatal (trouble), unlike the
        # warnings above.
        # NOTE(review): the opening of the age_form dict is missing from
        # this excerpt.
        'action_confirm': 'Confirm',
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
        # NOTE(review): the try: line is missing from this excerpt.
        self.report_age_confirmation()
        age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            # NOTE(review): the return is missing from this excerpt.
939 def _real_extract(self, url):
940 # Extract video id from URL
941 mobj = re.match(self._VALID_URL, url)
943 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
945 video_id = mobj.group(2)
948 self.report_video_webpage_download(video_id)
949 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
951 video_webpage = urllib2.urlopen(request).read()
952 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
953 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
956 # Attempt to extract SWF player URL
957 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
959 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
964 self.report_video_info_webpage_download(video_id)
965 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
966 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
967 % (video_id, el_type))
968 request = urllib2.Request(video_info_url)
970 video_info_webpage = urllib2.urlopen(request).read()
971 video_info = parse_qs(video_info_webpage)
972 if 'token' in video_info:
974 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
975 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
977 if 'token' not in video_info:
978 if 'reason' in video_info:
979 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
981 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
984 # Start extracting information
985 self.report_information_extraction(video_id)
988 if 'author' not in video_info:
989 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
991 video_uploader = urllib.unquote_plus(video_info['author'][0])
994 if 'title' not in video_info:
995 self._downloader.trouble(u'ERROR: unable to extract video title')
997 video_title = urllib.unquote_plus(video_info['title'][0])
998 video_title = video_title.decode('utf-8')
999 video_title = sanitize_title(video_title)
1002 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1003 simple_title = simple_title.strip(ur'_')
1006 if 'thumbnail_url' not in video_info:
1007 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1008 video_thumbnail = ''
1009 else: # don't panic if we can't find it
1010 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1014 mobj = re.search(r'id="eow-date".*?>(.*?)</span>', video_webpage, re.DOTALL)
1015 if mobj is not None:
1016 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1017 format_expressions = ['%d %B %Y', '%B %d %Y']
1018 for expression in format_expressions:
1020 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1025 video_description = 'No description available.'
1026 if self._downloader.params.get('forcedescription', False):
1027 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1028 if mobj is not None:
1029 video_description = mobj.group(1)
1032 video_token = urllib.unquote_plus(video_info['token'][0])
1034 # Decide which formats to download
1035 req_format = self._downloader.params.get('format', None)
1037 if 'fmt_url_map' in video_info:
1038 url_map = dict(tuple(pair.split('|')) for pair in video_info['fmt_url_map'][0].split(','))
1039 format_limit = self._downloader.params.get('format_limit', None)
1040 if format_limit is not None and format_limit in self._available_formats:
1041 format_list = self._available_formats[self._available_formats.index(format_limit):]
1043 format_list = self._available_formats
1044 existing_formats = [x for x in format_list if x in url_map]
1045 if len(existing_formats) == 0:
1046 self._downloader.trouble(u'ERROR: no known formats available for video')
1048 if req_format is None:
1049 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1050 elif req_format == '-1':
1051 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1054 if req_format not in url_map:
1055 self._downloader.trouble(u'ERROR: requested format not available')
1057 video_url_list = [(req_format, url_map[req_format])] # Specific format
1059 elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1060 self.report_rtmp_download()
1061 video_url_list = [(None, video_info['conn'][0])]
1064 self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
1067 for format_param, video_real_url in video_url_list:
1068 # At this point we have a new video
1069 self._downloader.increment_downloads()
1072 video_extension = self._video_extensions.get(format_param, 'flv')
1074 # Find the video URL in fmt_url_map or conn paramters
1076 # Process video information
1077 self._downloader.process_info({
1078 'id': video_id.decode('utf-8'),
1079 'url': video_real_url.decode('utf-8'),
1080 'uploader': video_uploader.decode('utf-8'),
1081 'upload_date': upload_date,
1082 'title': video_title,
1083 'stitle': simple_title,
1084 'ext': video_extension.decode('utf-8'),
1085 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1086 'thumbnail': video_thumbnail.decode('utf-8'),
1087 'description': video_description.decode('utf-8'),
1088 'player_url': player_url,
1090 except UnavailableVideoError, err:
1091 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): this is an elided listing — the embedded numbers are original
# file line numbers; gaps (1096, 1100-1101, 1105-1107, 1129, ...) mark lines
# missing from this excerpt ('try:', 'return', 'if mobj is None:', blank
# lines, and the 'def suitable(url):' header before line 1108).
# Code lines below are untouched; only comments were added.
1094 class MetacafeIE(InfoExtractor):
1095 """Information Extractor for metacafe.com."""
# URL groups: (1) video id, (2) URL slug used as the "simple title".
1097 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
# Family-filter pages: GET the disclaimer, then POST the age confirmation
# so subsequent watch-page fetches are unrestricted.
1098 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1099 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
# Keeps a YoutubeIE around: Metacafe mirrors YouTube videos under 'yt-<id>'
# ids and delegates those to the YouTube extractor (see _real_extract).
1102 def __init__(self, youtube_ie, downloader=None):
1103 InfoExtractor.__init__(self, downloader)
1104 self._youtube_ie = youtube_ie
# Body of the (elided) 'def suitable(url):' static check.
1108 return (re.match(MetacafeIE._VALID_URL, url) is not None)
1110 def report_disclaimer(self):
1111 """Report disclaimer retrieval."""
1112 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1114 def report_age_confirmation(self):
1115 """Report attempt to confirm age."""
1116 self._downloader.to_screen(u'[metacafe] Confirming age')
1118 def report_download_webpage(self, video_id):
1119 """Report webpage download."""
1120 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1122 def report_extraction(self, video_id):
1123 """Report information extraction."""
1124 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
# One-time setup: fetch the disclaimer page, then confirm age via POST.
# The 'try:' lines around both network calls are elided from this listing.
1126 def _real_initialize(self):
1127 # Retrieve disclaimer
1128 request = urllib2.Request(self._DISCLAIMER)
1130 self.report_disclaimer()
1131 disclaimer = urllib2.urlopen(request).read()
1132 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1133 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
# Elided lines 1134-1138 presumably build disclaimer_form (dict with the
# filter fields) — TODO confirm against the full source.
1139 'submit': "Continue - I'm over 18",
1141 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1143 self.report_age_confirmation()
1144 disclaimer = urllib2.urlopen(request).read()
1145 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1146 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1149 def _real_extract(self, url):
1150 # Extract id and simplified title from URL
1151 mobj = re.match(self._VALID_URL, url)
# (elided 'if mobj is None:' guard + 'return' after the trouble() call)
1153 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1156 video_id = mobj.group(1)
1158 # Check if video comes from YouTube
1159 mobj2 = re.match(r'^yt-(.*)$', video_id)
1160 if mobj2 is not None:
# Delegate mirrored YouTube videos ('yt-<id>') to the YouTube extractor.
1161 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1164 # At this point we have a new video
1165 self._downloader.increment_downloads()
# Simple title comes straight from the URL slug (group 2), not the page.
1167 simple_title = mobj.group(2).decode('utf-8')
1169 # Retrieve video webpage to extract further information
1170 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1172 self.report_download_webpage(video_id)
1173 webpage = urllib2.urlopen(request).read()
1174 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1175 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1178 # Extract URL, uploader and title from webpage
1179 self.report_extraction(video_id)
# Primary strategy: direct &mediaURL= parameter, optionally signed with
# a &gdaKey= token appended as '?__gda__='.
1180 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1181 if mobj is not None:
1182 mediaURL = urllib.unquote(mobj.group(1))
1183 video_extension = mediaURL[-3:]
1185 # Extract gdaKey if available
1186 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1188 video_url = mediaURL
1190 gdaKey = mobj.group(1)
1191 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Fallback strategy (elided 'else:' branch): parse the player's
# flashvars query string and read mediaURL/key out of its mediaData JSON.
1193 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1195 self._downloader.trouble(u'ERROR: unable to extract media URL')
1197 vardict = parse_qs(mobj.group(1))
1198 if 'mediaData' not in vardict:
1199 self._downloader.trouble(u'ERROR: unable to extract media URL')
1201 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1203 self._downloader.trouble(u'ERROR: unable to extract media URL')
# Unescape JSON-style '\/' in the URL.
1205 mediaURL = mobj.group(1).replace('\\/', '/')
1206 video_extension = mediaURL[-3:]
1207 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1209 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1211 self._downloader.trouble(u'ERROR: unable to extract title')
1213 video_title = mobj.group(1).decode('utf-8')
1214 video_title = sanitize_title(video_title)
1216 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1218 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1220 video_uploader = mobj.group(1)
1223 # Process video information
1224 self._downloader.process_info({
1225 'id': video_id.decode('utf-8'),
1226 'url': video_url.decode('utf-8'),
1227 'uploader': video_uploader.decode('utf-8'),
1228 'upload_date': u'NA',
1229 'title': video_title,
1230 'stitle': simple_title,
1231 'ext': video_extension.decode('utf-8'),
# (elided lines 1232-1234 presumably close the dict — 'format'/'player_url'
# keys as in YoutubeIE — TODO confirm)
1235 except UnavailableVideoError:
1236 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): elided listing — embedded numbers are original line numbers;
# number gaps mark missing 'try:' / 'return' / 'if mobj is None:' / blank
# lines and the 'def suitable(url):' header before line 1249.
# Code lines are untouched; only comments were added.
1239 class DailymotionIE(InfoExtractor):
1240 """Information Extractor for Dailymotion"""
# URL groups: (1) video id (before the underscore), (2) slug used as the
# "simple title". Matches any dailymotion.<tld> /video/ URL.
1242 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1244 def __init__(self, downloader=None):
1245 InfoExtractor.__init__(self, downloader)
# Body of the (elided) 'def suitable(url):' static check.
1249 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1251 def report_download_webpage(self, video_id):
1252 """Report webpage download."""
1253 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1255 def report_extraction(self, video_id):
1256 """Report information extraction."""
1257 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
# No setup needed (body elided; presumably just 'return' — TODO confirm).
1259 def _real_initialize(self):
1262 def _real_extract(self, url):
1263 # Extract id and simplified title from URL
1264 mobj = re.match(self._VALID_URL, url)
# (elided guard: trouble() is followed by an elided 'return')
1266 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1269 # At this point we have a new video
1270 self._downloader.increment_downloads()
1271 video_id = mobj.group(1)
# Simple title comes from the URL slug; extension is hard-coded to flv.
1273 simple_title = mobj.group(2).decode('utf-8')
1274 video_extension = 'flv'
1276 # Retrieve video webpage to extract further information
1277 request = urllib2.Request(url)
1279 self.report_download_webpage(video_id)
1280 webpage = urllib2.urlopen(request).read()
1281 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1282 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1285 # Extract URL, uploader and title from webpage
1286 self.report_extraction(video_id)
# Media URL is passed to the Flash player via addVariable("video", "...").
1287 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1289 self._downloader.trouble(u'ERROR: unable to extract media URL')
1291 mediaURL = urllib.unquote(mobj.group(1))
1293 # if needed add http://www.dailymotion.com/ if relative URL
1295 video_url = mediaURL
1297 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1298 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1300 self._downloader.trouble(u'ERROR: unable to extract title')
1302 video_title = mobj.group(1).decode('utf-8')
1303 video_title = sanitize_title(video_title)
1305 mobj = re.search(r'(?im)<div class="dmco_html owner">.*?<a class="name" href="/.+?">(.+?)</a>', webpage)
1307 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1309 video_uploader = mobj.group(1)
1312 # Process video information
1313 self._downloader.process_info({
1314 'id': video_id.decode('utf-8'),
1315 'url': video_url.decode('utf-8'),
1316 'uploader': video_uploader.decode('utf-8'),
1317 'upload_date': u'NA',
1318 'title': video_title,
1319 'stitle': simple_title,
1320 'ext': video_extension.decode('utf-8'),
# (elided lines 1321-1323 presumably close the dict with 'format'/
# 'player_url' keys — TODO confirm)
1324 except UnavailableVideoError:
1325 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): elided listing — embedded numbers are original line numbers;
# number gaps mark missing 'try:' / 'return' / 'if mobj is None:' / blank
# lines and the 'def suitable(url):' header before line 1337.
# Code lines are untouched; only comments were added.
1327 class GoogleIE(InfoExtractor):
1328 """Information extractor for video.google.com."""
# Matches video.google under many country TLDs; group(1) is the docid.
1330 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1332 def __init__(self, downloader=None):
1333 InfoExtractor.__init__(self, downloader)
# Body of the (elided) 'def suitable(url):' static check.
1337 return (re.match(GoogleIE._VALID_URL, url) is not None)
1339 def report_download_webpage(self, video_id):
1340 """Report webpage download."""
1341 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1343 def report_extraction(self, video_id):
1344 """Report information extraction."""
1345 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
# No setup needed (body elided — TODO confirm).
1347 def _real_initialize(self):
1350 def _real_extract(self, url):
1351 # Extract id from URL
1352 mobj = re.match(self._VALID_URL, url)
1354 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1357 # At this point we have a new video
1358 self._downloader.increment_downloads()
1359 video_id = mobj.group(1)
# Default to mp4 (download_url variant); falls back to flv below.
1361 video_extension = 'mp4'
1363 # Retrieve video webpage to extract further information
1364 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1366 self.report_download_webpage(video_id)
1367 webpage = urllib2.urlopen(request).read()
1368 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1369 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1372 # Extract URL, uploader, and title from webpage
1373 self.report_extraction(video_id)
# Preferred: the page's direct download_url. If absent (elided branch),
# fall back to the flv stream embedded as a \x-escaped videoUrl.
1374 mobj = re.search(r"download_url:'([^']+)'", webpage)
1376 video_extension = 'flv'
1377 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1379 self._downloader.trouble(u'ERROR: unable to extract media URL')
1381 mediaURL = urllib.unquote(mobj.group(1))
# Turn the literal backslash escapes ('\x3d', '\x26') into '=' and '&'.
1382 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1383 mediaURL = mediaURL.replace('\\x26', '\x26')
1385 video_url = mediaURL
1387 mobj = re.search(r'<title>(.*)</title>', webpage)
1389 self._downloader.trouble(u'ERROR: unable to extract title')
1391 video_title = mobj.group(1).decode('utf-8')
1392 video_title = sanitize_title(video_title)
# Collapse any run of non-filename-safe chars into a single underscore.
1393 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1395 # Extract video description
1396 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1398 self._downloader.trouble(u'ERROR: unable to extract video description')
1400 video_description = mobj.group(1).decode('utf-8')
1401 if not video_description:
1402 video_description = 'No description available.'
1404 # Extract video thumbnail
# Thumbnail requires a second request (a search-results page), so it is
# only fetched when --get-thumbnail was requested.
1405 if self._downloader.params.get('forcethumbnail', False):
1406 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1408 webpage = urllib2.urlopen(request).read()
1409 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1410 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1412 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1414 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1416 video_thumbnail = mobj.group(1)
1417 else: # we need something to pass to process_info
1418 video_thumbnail = ''
1422 # Process video information
1423 self._downloader.process_info({
1424 'id': video_id.decode('utf-8'),
1425 'url': video_url.decode('utf-8'),
# (elided line 1426 presumably carries the 'uploader' key — TODO confirm)
1427 'upload_date': u'NA',
1428 'title': video_title,
1429 'stitle': simple_title,
1430 'ext': video_extension.decode('utf-8'),
# (elided lines 1431-1433 presumably close the dict — TODO confirm)
1434 except UnavailableVideoError:
1435 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): elided listing — embedded numbers are original line numbers;
# number gaps mark missing 'try:' / 'return' / 'if mobj is None:' / blank
# lines and the 'def suitable(url):' header before line 1448.
# Code lines are untouched; only comments were added.
1438 class PhotobucketIE(InfoExtractor):
1439 """Information extractor for photobucket.com."""
# Only .flv media referenced through a 'current=' query parameter.
1441 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1443 def __init__(self, downloader=None):
1444 InfoExtractor.__init__(self, downloader)
# Body of the (elided) 'def suitable(url):' static check.
1448 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1450 def report_download_webpage(self, video_id):
1451 """Report webpage download."""
1452 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1454 def report_extraction(self, video_id):
1455 """Report information extraction."""
1456 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
# No setup needed (body elided — TODO confirm).
1458 def _real_initialize(self):
1461 def _real_extract(self, url):
1462 # Extract id from URL
1463 mobj = re.match(self._VALID_URL, url)
1465 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1468 # At this point we have a new video
1469 self._downloader.increment_downloads()
# The "id" is the .flv filename captured from the URL.
1470 video_id = mobj.group(1)
1472 video_extension = 'flv'
1474 # Retrieve video webpage to extract further information
1475 request = urllib2.Request(url)
1477 self.report_download_webpage(video_id)
1478 webpage = urllib2.urlopen(request).read()
1479 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1480 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1483 # Extract URL, uploader, and title from webpage
1484 self.report_extraction(video_id)
# Media URL lives in the page's video_src <link> element.
1485 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1487 self._downloader.trouble(u'ERROR: unable to extract media URL')
1489 mediaURL = urllib.unquote(mobj.group(1))
1491 video_url = mediaURL
# One regex yields both title (group 1) and uploader (group 2).
1493 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1495 self._downloader.trouble(u'ERROR: unable to extract title')
1497 video_title = mobj.group(1).decode('utf-8')
1498 video_title = sanitize_title(video_title)
1499 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1501 video_uploader = mobj.group(2).decode('utf-8')
1504 # Process video information
1505 self._downloader.process_info({
1506 'id': video_id.decode('utf-8'),
1507 'url': video_url.decode('utf-8'),
1508 'uploader': video_uploader,
1509 'upload_date': u'NA',
1510 'title': video_title,
1511 'stitle': simple_title,
1512 'ext': video_extension.decode('utf-8'),
# (elided lines 1513-1515 presumably close the dict — TODO confirm)
1516 except UnavailableVideoError:
1517 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): elided listing — embedded numbers are original line numbers;
# number gaps mark missing 'try:' / 'return' / 'if mobj is None:' / blank
# lines and the 'def suitable(url):' header before line 1533.
# Code lines are untouched; only comments were added. Two defects are
# flagged inline below (uploader regex group, duplicate dict keys).
1520 class YahooIE(InfoExtractor):
1521 """Information extractor for video.yahoo.com."""
1523 # _VALID_URL matches all Yahoo! Video URLs
1524 # _VPAGE_URL matches only the extractable '/watch/' URLs
1525 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1526 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1528 def __init__(self, downloader=None):
1529 InfoExtractor.__init__(self, downloader)
# Body of the (elided) 'def suitable(url):' static check.
1533 return (re.match(YahooIE._VALID_URL, url) is not None)
1535 def report_download_webpage(self, video_id):
1536 """Report webpage download."""
1537 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1539 def report_extraction(self, video_id):
1540 """Report information extraction."""
1541 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
# No setup needed (body elided — TODO confirm).
1543 def _real_initialize(self):
# new_video=False marks the recursive second pass after URL rewriting,
# presumably so increment_downloads() is not double-counted (the guard
# using it is elided — TODO confirm).
1546 def _real_extract(self, url, new_video=True):
1547 # Extract ID from URL
1548 mobj = re.match(self._VALID_URL, url)
1550 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1553 # At this point we have a new video
1554 self._downloader.increment_downloads()
1555 video_id = mobj.group(2)
1556 video_extension = 'flv'
1558 # Rewrite valid but non-extractable URLs as
1559 # extractable English language /watch/ URLs
1560 if re.match(self._VPAGE_URL, url) is None:
1561 request = urllib2.Request(url)
1563 webpage = urllib2.urlopen(request).read()
1564 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1565 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1568 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1570 self._downloader.trouble(u'ERROR: Unable to extract id field')
1572 yahoo_id = mobj.group(1)
1574 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1576 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1578 yahoo_vid = mobj.group(1)
# Recurse once with the canonical /watch/ URL.
1580 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1581 return self._real_extract(url, new_video=False)
1583 # Retrieve video webpage to extract further information
1584 request = urllib2.Request(url)
1586 self.report_download_webpage(video_id)
1587 webpage = urllib2.urlopen(request).read()
1588 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1589 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1592 # Extract uploader and title from webpage
1593 self.report_extraction(video_id)
1594 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1596 self._downloader.trouble(u'ERROR: unable to extract video title')
1598 video_title = mobj.group(1).decode('utf-8')
1599 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1601 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1603 self._downloader.trouble(u'ERROR: unable to extract video uploader')
# BUG: group(1) of the regex above is the '(people|profile)' alternation,
# so video_uploader is always the literal 'people' or 'profile'; the
# actual nickname is group(2). Should be mobj.group(2).decode('utf-8').
1605 video_uploader = mobj.group(1).decode('utf-8')
1607 # Extract video thumbnail
1608 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1610 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1612 video_thumbnail = mobj.group(1).decode('utf-8')
1614 # Extract video description
1615 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1617 self._downloader.trouble(u'ERROR: unable to extract video description')
1619 video_description = mobj.group(1).decode('utf-8')
1620 if not video_description: video_description = 'No description available.'
1622 # Extract video height and width
1624 self._downloader.trouble(u'ERROR: unable to extract video height')
1623 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1625 self._downloader.trouble(u'ERROR: unable to extract video height')
1627 yv_video_height = mobj.group(1)
1629 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1631 self._downloader.trouble(u'ERROR: unable to extract video width')
1633 yv_video_width = mobj.group(1)
1635 # Retrieve video playlist to extract media URL
1636 # I'm not completely sure what all these options are, but we
1637 # seem to need most of them, otherwise the server sends a 401.
1638 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1639 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1640 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1641 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1642 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1644 self.report_download_webpage(video_id)
1645 webpage = urllib2.urlopen(request).read()
1646 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1647 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1650 # Extract media URL from playlist XML
1651 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1653 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1655 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
# Undo HTML entities embedded in the playlist XML URL.
1656 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1659 # Process video information
1660 self._downloader.process_info({
1661 'id': video_id.decode('utf-8'),
# (elided line 1662 presumably carries the 'url' key — TODO confirm)
1663 'uploader': video_uploader,
1664 'upload_date': u'NA',
1665 'title': video_title,
1666 'stitle': simple_title,
1667 'ext': video_extension.decode('utf-8'),
# BUG: 'thumbnail' and 'description' each appear twice in this dict
# literal; in Python the later (undecoded) entries below silently win,
# making the .decode('utf-8') calls here dead. The duplicates should go.
1668 'thumbnail': video_thumbnail.decode('utf-8'),
1669 'description': video_description,
1670 'thumbnail': video_thumbnail,
1671 'description': video_description,
1674 except UnavailableVideoError:
1675 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): elided listing — embedded numbers are original line numbers;
# number gaps mark missing 'try:' / 'return' / 'if mobj is None:' / blank
# lines and the 'def suitable(url):' method elided around lines 1684-1687.
# Code lines are untouched; only comments were added.
1678 class GenericIE(InfoExtractor):
1679 """Generic last-resort information extractor."""
1681 def __init__(self, downloader=None):
1682 InfoExtractor.__init__(self, downloader)
1688 def report_download_webpage(self, video_id):
1689 """Report webpage download."""
# Being reached at all means every site-specific extractor declined.
1690 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1691 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1693 def report_extraction(self, video_id):
1694 """Report information extraction."""
1695 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
# No setup needed (body elided — TODO confirm).
1697 def _real_initialize(self):
1700 def _real_extract(self, url):
1701 # At this point we have a new video
1702 self._downloader.increment_downloads()
# Provisional id: last URL path component; refined after the media URL
# is known (see below).
1704 video_id = url.split('/')[-1]
1705 request = urllib2.Request(url)
1707 self.report_download_webpage(video_id)
1708 webpage = urllib2.urlopen(request).read()
1709 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1710 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1712 except ValueError, err:
1713 # since this is the last-resort InfoExtractor, if
1714 # this error is thrown, it'll be thrown here
1715 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1718 self.report_extraction(video_id)
1719 # Start with something easy: JW Player in SWFObject
1720 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1722 # Broaden the search a little bit
1723 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1725 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1728 # It's possible that one of the regexes
1729 # matched, but returned an empty group:
1730 if mobj.group(1) is None:
1731 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1734 video_url = urllib.unquote(mobj.group(1))
# Re-derive id/extension from the media URL's basename.
1735 video_id = os.path.basename(video_url)
1737 # here's a fun little line of code for you:
1738 video_extension = os.path.splitext(video_id)[1][1:]
1739 video_id = os.path.splitext(video_id)[0]
1741 # it's tempting to parse this further, but you would
1742 # have to take into account all the variations like
1743 # Video Title - Site Name
1744 # Site Name | Video Title
1745 # Video Title - Tagline | Site Name
1746 # and so on and so forth; it's just not practical
1747 mobj = re.search(r'<title>(.*)</title>', webpage)
1749 self._downloader.trouble(u'ERROR: unable to extract title')
1751 video_title = mobj.group(1).decode('utf-8')
1752 video_title = sanitize_title(video_title)
1753 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1755 # video uploader is domain name
1756 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
# NOTE(review): error text says 'title' but this step extracts the
# uploader/domain — misleading message worth fixing in the full source.
1758 self._downloader.trouble(u'ERROR: unable to extract title')
1760 video_uploader = mobj.group(1).decode('utf-8')
1763 # Process video information
1764 self._downloader.process_info({
1765 'id': video_id.decode('utf-8'),
1766 'url': video_url.decode('utf-8'),
1767 'uploader': video_uploader,
1768 'upload_date': u'NA',
1769 'title': video_title,
1770 'stitle': simple_title,
1771 'ext': video_extension.decode('utf-8'),
# (elided lines 1772-1774 presumably close the dict — TODO confirm)
1775 except UnavailableVideoError, err:
1776 self._downloader.trouble(u'\nERROR: unable to download video')
# NOTE(review): elided listing — embedded numbers are original line numbers;
# number gaps mark missing 'try:' / 'return' / loop headers / blank lines,
# the 'def suitable(url):' header before line 1794, and the numeric-prefix
# parsing ('n = long(prefix)'-style) before line 1823.
# Code lines are untouched; only comments were added.
1779 class YoutubeSearchIE(InfoExtractor):
1780 """Information Extractor for YouTube search queries."""
# Query syntax: 'ytsearch:Q' (first hit), 'ytsearchN:Q', 'ytsearchall:Q'.
1781 _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1782 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1783 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1784 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1786 _max_youtube_results = 1000
# Delegates the actual downloads of found ids to the given YoutubeIE.
1788 def __init__(self, youtube_ie, downloader=None):
1789 InfoExtractor.__init__(self, downloader)
1790 self._youtube_ie = youtube_ie
# Body of the (elided) 'def suitable(url):' static check.
1794 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1796 def report_download_page(self, query, pagenum):
1797 """Report attempt to download playlist page with given number."""
1798 query = query.decode(preferredencoding())
1799 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1801 def _real_initialize(self):
1802 self._youtube_ie.initialize()
1804 def _real_extract(self, query):
1805 mobj = re.match(self._VALID_QUERY, query)
1807 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
# Split 'ytsearchN' prefix from the query text; dispatch on N.
1810 prefix, query = query.split(':')
1812 query = query.encode('utf-8')
# (elided: empty-prefix branch → single result)
1814 self._download_n_results(query, 1)
1816 elif prefix == 'all':
1817 self._download_n_results(query, self._max_youtube_results)
# (elided lines 1818-1822: numeric prefix parsed inside a try; n <= 0
# falls through to the error below — TODO confirm against full source)
1823 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1825 elif n > self._max_youtube_results:
# Clamp oversized requests to the service maximum rather than failing.
1826 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1827 n = self._max_youtube_results
1828 self._download_n_results(query, n)
1830 except ValueError: # parsing prefix as integer fails
1831 self._download_n_results(query, 1)
1834 def _download_n_results(self, query, n):
1835 """Downloads a specified number of results for a query"""
# (elided: video_ids list and pagenum initialisation, loop header)
1838 already_seen = set()
1842 self.report_download_page(query, pagenum)
1843 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1844 request = urllib2.Request(result_url)
1846 page = urllib2.urlopen(request).read()
1847 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1848 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1851 # Extract video identifiers
1852 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# Slice out 'href="/watch?v=ID"' and keep the ID (drop trailing quote).
1853 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1854 if video_id not in already_seen:
1855 video_ids.append(video_id)
1856 already_seen.add(video_id)
1857 if len(video_ids) == n:
1858 # Specified n videos reached
1859 for id in video_ids:
1860 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# No "Next" link on the page ⇒ results exhausted: flush what we have.
1863 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1864 for id in video_ids:
1865 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1868 pagenum = pagenum + 1
# NOTE(review): elided listing — embedded numbers are original line numbers;
# number gaps mark missing 'try:' / 'return' / loop headers / blank lines,
# the 'def suitable(url):' header before line 1885, and the numeric-prefix
# parsing before line 1914. Structure mirrors YoutubeSearchIE.
# Code lines are untouched; only comments were added.
1870 class GoogleSearchIE(InfoExtractor):
1871 """Information Extractor for Google Video search queries."""
# Query syntax: 'gvsearch:Q', 'gvsearchN:Q', 'gvsearchall:Q'.
1872 _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1873 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1874 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1875 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1877 _max_google_results = 1000
# Delegates downloads of found docids to the given GoogleIE.
1879 def __init__(self, google_ie, downloader=None):
1880 InfoExtractor.__init__(self, downloader)
1881 self._google_ie = google_ie
# Body of the (elided) 'def suitable(url):' static check.
1885 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1887 def report_download_page(self, query, pagenum):
1888 """Report attempt to download playlist page with given number."""
1889 query = query.decode(preferredencoding())
1890 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1892 def _real_initialize(self):
1893 self._google_ie.initialize()
1895 def _real_extract(self, query):
1896 mobj = re.match(self._VALID_QUERY, query)
1898 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1901 prefix, query = query.split(':')
1903 query = query.encode('utf-8')
# (elided: empty-prefix branch → single result)
1905 self._download_n_results(query, 1)
1907 elif prefix == 'all':
1908 self._download_n_results(query, self._max_google_results)
# (elided lines 1909-1913: numeric prefix parsed inside a try — TODO confirm)
1914 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1916 elif n > self._max_google_results:
# Clamp oversized requests to the service maximum rather than failing.
1917 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1918 n = self._max_google_results
1919 self._download_n_results(query, n)
1921 except ValueError: # parsing prefix as integer fails
1922 self._download_n_results(query, 1)
1925 def _download_n_results(self, query, n):
1926 """Downloads a specified number of results for a query"""
# (elided: video_ids list and pagenum initialisation, loop header)
1929 already_seen = set()
1933 self.report_download_page(query, pagenum)
1934 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1935 request = urllib2.Request(result_url)
1937 page = urllib2.urlopen(request).read()
1938 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1939 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1942 # Extract video identifiers
1943 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# Unlike YoutubeSearchIE, the docid comes straight from regex group(1).
1944 video_id = mobj.group(1)
1945 if video_id not in already_seen:
1946 video_ids.append(video_id)
1947 already_seen.add(video_id)
1948 if len(video_ids) == n:
1949 # Specified n videos reached
1950 for id in video_ids:
1951 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
# No "Next" span on the page ⇒ results exhausted: flush what we have.
1954 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1955 for id in video_ids:
1956 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1959 pagenum = pagenum + 1
# NOTE(review): elided listing — embedded numbers are original line numbers;
# number gaps mark missing 'try:' / 'return' / loop headers / blank lines,
# the 'def suitable(url):' header before line 1976, and the numeric-prefix
# parsing before line 2005. Structure mirrors YoutubeSearchIE/GoogleSearchIE.
# Code lines are untouched; only comments were added.
1961 class YahooSearchIE(InfoExtractor):
1962 """Information Extractor for Yahoo! Video search queries."""
# Query syntax: 'yvsearch:Q', 'yvsearchN:Q', 'yvsearchall:Q'.
1963 _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
1964 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1965 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1966 _MORE_PAGES_INDICATOR = r'\s*Next'
1968 _max_yahoo_results = 1000
# Delegates downloads of found ids to the given YahooIE.
1970 def __init__(self, yahoo_ie, downloader=None):
1971 InfoExtractor.__init__(self, downloader)
1972 self._yahoo_ie = yahoo_ie
# Body of the (elided) 'def suitable(url):' static check.
1976 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
1978 def report_download_page(self, query, pagenum):
1979 """Report attempt to download playlist page with given number."""
1980 query = query.decode(preferredencoding())
1981 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1983 def _real_initialize(self):
1984 self._yahoo_ie.initialize()
1986 def _real_extract(self, query):
1987 mobj = re.match(self._VALID_QUERY, query)
1989 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1992 prefix, query = query.split(':')
1994 query = query.encode('utf-8')
# (elided: empty-prefix branch → single result)
1996 self._download_n_results(query, 1)
1998 elif prefix == 'all':
1999 self._download_n_results(query, self._max_yahoo_results)
# (elided lines 2000-2004: numeric prefix parsed inside a try — TODO confirm)
2005 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2007 elif n > self._max_yahoo_results:
# Clamp oversized requests to the service maximum rather than failing.
2008 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2009 n = self._max_yahoo_results
2010 self._download_n_results(query, n)
2012 except ValueError: # parsing prefix as integer fails
2013 self._download_n_results(query, 1)
2016 def _download_n_results(self, query, n):
2017 """Downloads a specified number of results for a query"""
# (elided: video_ids list and pagenum initialisation, loop header)
2020 already_seen = set()
2024 self.report_download_page(query, pagenum)
2025 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2026 request = urllib2.Request(result_url)
2028 page = urllib2.urlopen(request).read()
2029 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2030 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2033 # Extract video identifiers
2034 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
# id is the 'NNN/NNN' pair captured from the watch URL.
2035 video_id = mobj.group(1)
2036 if video_id not in already_seen:
2037 video_ids.append(video_id)
2038 already_seen.add(video_id)
2039 if len(video_ids) == n:
2040 # Specified n videos reached
2041 for id in video_ids:
2042 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
# No "Next" link on the page ⇒ results exhausted: flush what we have.
2045 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2046 for id in video_ids:
2047 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2050 pagenum = pagenum + 1
# Walks a YouTube playlist page by page, collects the video ids, applies
# the --playlist-start/--playlist-end window, and delegates each video to
# the wrapped YoutubeIE.
2052 class YoutubePlaylistIE(InfoExtractor):
2053 	"""Information Extractor for YouTube playlists."""
# NOTE(review): the dot in 'youtube.com' is unescaped, so this also
# matches e.g. 'youtubeXcom'; r'youtube\.com' was probably intended.
2055 	_VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/|p/)([^&]+).*'
# %s placeholders: (playlist id, page number). gl/hl pin the US English site.
2056 	_TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
2057 	_VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2058 	_MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2061 	def __init__(self, youtube_ie, downloader=None):
2062 		InfoExtractor.__init__(self, downloader)
# Mutual-registration pattern: keep the concrete YouTube IE for delegation.
2063 		self._youtube_ie = youtube_ie
2067 		return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
2069 	def report_download_page(self, playlist_id, pagenum):
2070 		"""Report attempt to download playlist page with given number."""
2071 		self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2073 	def _real_initialize(self):
2074 		self._youtube_ie.initialize()
2076 	def _real_extract(self, url):
2077 		# Extract playlist id
2078 		mobj = re.match(self._VALID_URL, url)
2080 			self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2083 		# Download playlist pages
2084 		playlist_id = mobj.group(1)
2089 			self.report_download_page(playlist_id, pagenum)
2090 			request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum))
2092 				page = urllib2.urlopen(request).read()
2093 			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2094 				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2097 			# Extract video identifiers
# ids_in_page de-duplicates within a single page only; duplicates across
# pages are not filtered here (unlike the search IEs' already_seen set).
2099 			for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2100 				if mobj.group(1) not in ids_in_page:
2101 					ids_in_page.append(mobj.group(1))
2102 			video_ids.extend(ids_in_page)
# Stop when there is no "Next" link on the page.
2104 			if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2106 			pagenum = pagenum + 1
# Apply the user-requested playlist window (1-based start -> 0-based index).
2108 		playliststart = self._downloader.params.get('playliststart', 1) - 1
2109 		playlistend = self._downloader.params.get('playlistend', -1)
# NOTE(review): --playlist-end documents -1 as "default is last", but
# video_ids[start:-1] EXCLUDES the final element, so the last playlist
# video is silently dropped in the default case. Needs a -1 special case.
2110 		video_ids = video_ids[playliststart:playlistend]
2112 		for id in video_ids:
2113 			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# Fetches a YouTube user's upload feed (GData API), extracts the video
# ids, applies the playlist start/end window, and delegates to YoutubeIE.
2116 class YoutubeUserIE(InfoExtractor):
2117 	"""Information Extractor for YouTube users."""
# NOTE(review): unescaped dot in 'youtube.com', and the greedy '(.*)'
# captures everything after /user/ including any trailing path/query.
2119 	_VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
# Single GData feed request — no pagination here, so presumably only the
# feed's default number of entries is ever seen. TODO confirm.
2120 	_TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2121 	_VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
2124 	def __init__(self, youtube_ie, downloader=None):
2125 		InfoExtractor.__init__(self, downloader)
# Mutual-registration pattern: keep the concrete YouTube IE for delegation.
2126 		self._youtube_ie = youtube_ie
2130 		return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2132 	def report_download_page(self, username):
2133 		"""Report attempt to download user page."""
2134 		self._downloader.to_screen(u'[youtube] user %s: Downloading page ' % (username))
2136 	def _real_initialize(self):
2137 		self._youtube_ie.initialize()
2139 	def _real_extract(self, url):
2141 		mobj = re.match(self._VALID_URL, url)
2143 			self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2146 		# Download user page
2147 		username = mobj.group(1)
2151 		self.report_download_page(username)
2152 		request = urllib2.Request(self._TEMPLATE_URL % (username))
2154 			page = urllib2.urlopen(request).read()
2155 		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2156 			self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2159 		# Extract video identifiers
2162 		for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2163 			if mobj.group(1) not in ids_in_page:
2164 				ids_in_page.append(mobj.group(1))
2165 		video_ids.extend(ids_in_page)
# Apply the user-requested playlist window (1-based start -> 0-based index).
2167 		playliststart = self._downloader.params.get('playliststart', 1) - 1
2168 		playlistend = self._downloader.params.get('playlistend', -1)
# NOTE(review): same off-by-one as YoutubePlaylistIE — with the default
# playlistend of -1 ("default is last" per --playlist-end help), the slice
# [start:-1] drops the final video id.
2169 		video_ids = video_ids[playliststart:playlistend]
2171 		for id in video_ids:
2172 			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
# Standalone extractor for depositfiles.com: simulates pressing the "Free
# download" button and scrapes the real file URL and title from the page.
2175 class DepositFilesIE(InfoExtractor):
2176 	"""Information extractor for depositfiles.com"""
# '(?#locale)' is a regex comment; the '..' before it matches the
# two-letter locale path segment (any two characters — '.' is unescaped
# here and in 'depositfiles.com', so matching is looser than intended).
2178 	_VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2180 	def __init__(self, downloader=None):
2181 		InfoExtractor.__init__(self, downloader)
2185 		return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2187 	def report_download_webpage(self, file_id):
2188 		"""Report webpage download."""
2189 		self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2191 	def report_extraction(self, file_id):
2192 		"""Report information extraction."""
2193 		self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
# No login/setup needed for depositfiles.
2195 	def _real_initialize(self):
2198 	def _real_extract(self, url):
2199 		# At this point we have a new file
2200 		self._downloader.increment_downloads()
2202 		file_id = url.split('/')[-1]
2203 		# Rebuild url in english locale
2204 		url = 'http://depositfiles.com/en/files/' + file_id
2206 		# Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 simulates the "Free download" button press.
2207 		free_download_indication = { 'gateway_result' : '1' }
2208 		request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2210 			self.report_download_webpage(file_id)
2211 			webpage = urllib2.urlopen(request).read()
2212 		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2213 			self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2216 		# Search for the real file URL
# NOTE(review): 'fileshare.+?' has an unescaped dot as well.
2217 		mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2218 		if (mobj is None) or (mobj.group(1) is None):
2219 			# Try to figure out reason of the error.
# The site explains restrictions (e.g. download limits) in a <strong>
# block starting with "Attention"; surface that text to the user.
2220 			mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2221 			if (mobj is not None) and (mobj.group(1) is not None):
# NOTE(review): non-raw '\s+' works only because Python 2 leaves unknown
# string escapes intact; the conventional spelling is r'\s+'.
2222 				restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2223 				self._downloader.trouble(u'ERROR: %s' % restriction_message)
2225 				self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2228 		file_url = mobj.group(1)
# Extension taken from the URL path, without the leading dot.
2229 		file_extension = os.path.splitext(file_url)[1][1:]
2231 		# Search for file title
2232 		mobj = re.search(r'<b title="(.*?)">', webpage)
2234 			self._downloader.trouble(u'ERROR: unable to extract title')
2236 		file_title = mobj.group(1).decode('utf-8')
2239 		# Process file information
# Fields follow the InfoExtractor result convention used elsewhere in
# this file ('NA' for unknown values, both title and sanitized stitle).
2240 		self._downloader.process_info({
2241 			'id':		file_id.decode('utf-8'),
2242 			'url':		file_url.decode('utf-8'),
2244 			'upload_date':	u'NA',
2245 			'title':	file_title,
2246 			'stitle':	file_title,
2247 			'ext':		file_extension.decode('utf-8'),
2251 		except UnavailableVideoError, err:
2252 			self._downloader.trouble(u'ERROR: unable to download file')
# Base class for post-download processing steps; concrete processors
# override run(). The default implementation passes data through untouched.
2254 class PostProcessor(object):
2255 	"""Post Processor class.
2257 	PostProcessor objects can be added to downloaders with their
2258 	add_post_processor() method. When the downloader has finished a
2259 	successful download, it will take its internal chain of PostProcessors
2260 	and start calling the run() method on each one of them, first with
2261 	an initial argument and then with the returned value of the previous
2264 	The chain will be stopped if one of them ever returns None or the end
2265 	of the chain is reached.
2267 	PostProcessor objects follow a "mutual registration" process similar
2268 	to InfoExtractor objects.
2273 	def __init__(self, downloader=None):
# The owning FileDownloader; may be attached later via set_downloader().
2274 		self._downloader = downloader
2276 	def set_downloader(self, downloader):
2277 		"""Sets the downloader for this PP."""
2278 		self._downloader = downloader
2280 	def run(self, information):
2281 		"""Run the PostProcessor.
2283 		The "information" argument is a dictionary like the ones
2284 		composed by InfoExtractors. The only difference is that this
2285 		one has an extra field called "filepath" that points to the
2288 		When this method returns None, the postprocessing chain is
2289 		stopped. However, this method may return an information
2290 		dictionary that will be passed to the next postprocessing
2291 		object in the chain. It can be the one it received after
2292 		changing some fields.
2294 		In addition, this method may raise a PostProcessingError
2295 		exception that will be taken into account by the downloader
# Base-class behavior: a no-op that keeps the chain going.
2298 		return information # by default, do nothing
2300 ### MAIN PROGRAM ###
2301 if __name__ == '__main__':
2303 	# Modules needed only when running the main program
2307 	# Function to update the program file with the latest version from the repository.
# Self-update: reads the LATEST_VERSION tag from the repository, downloads
# that tagged copy of the script, and overwrites `filename` (the running
# script) in place. Exits the process on any failure.
2308 	def update_self(downloader, filename):
2309 		# Note: downloader only used for options
# Fail early if we cannot write the target, before any network traffic.
2310 		if not os.access(filename, os.W_OK):
2311 			sys.exit('ERROR: no write permissions on %s' % filename)
2313 		downloader.to_screen('Updating to latest stable version...')
# NOTE(review): the replacement code is fetched over plain http:// and
# written over the executable with no signature or checksum verification.
2315 			latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
2316 			latest_version = urllib.urlopen(latest_url).read().strip()
2317 			prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
2318 			newcontent = urllib.urlopen(prog_url).read()
2319 		except (IOError, OSError), err:
2320 			sys.exit('ERROR: unable to download latest version')
2322 			stream = open(filename, 'w')
2323 			stream.write(newcontent)
2325 		except (IOError, OSError), err:
2326 			sys.exit('ERROR: unable to overwrite current version')
2327 		downloader.to_screen('Updated to version %s' % latest_version)
2329 	# Parse command line
# conflict_handler='resolve' lets -h/-v be rebound below without optparse
# raising on the builtin --help/--version definitions.
2330 	parser = optparse.OptionParser(
2331 		usage='Usage: %prog [options] url...',
2332 		version='2010.12.09',
2333 		conflict_handler='resolve',
2336 	parser.add_option('-h', '--help',
2337 			action='help', help='print this help text and exit')
2338 	parser.add_option('-v', '--version',
2339 			action='version', help='print program version and exit')
2340 	parser.add_option('-U', '--update',
2341 			action='store_true', dest='update_self', help='update this program to latest stable version')
2342 	parser.add_option('-i', '--ignore-errors',
2343 			action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
2344 	parser.add_option('-r', '--rate-limit',
2345 			dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
# retries/playliststart/playlistend arrive as strings from the command
# line; they are validated and converted to long after parse_args below.
2346 	parser.add_option('-R', '--retries',
2347 			dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
2348 	parser.add_option('--playlist-start',
2349 			dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
2350 	parser.add_option('--playlist-end',
2351 			dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
2352 	parser.add_option('--dump-user-agent',
2353 			action='store_true', dest='dump_user_agent', help='display the current browser identification', default=False)
2355 	authentication = optparse.OptionGroup(parser, 'Authentication Options')
2356 	authentication.add_option('-u', '--username',
2357 			dest='username', metavar='USERNAME', help='account username')
2358 	authentication.add_option('-p', '--password',
2359 			dest='password', metavar='PASSWORD', help='account password')
2360 	authentication.add_option('-n', '--netrc',
2361 			action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
2362 	parser.add_option_group(authentication)
2364 	video_format = optparse.OptionGroup(parser, 'Video Format Options')
2365 	video_format.add_option('-f', '--format',
2366 			action='store', dest='format', metavar='FORMAT', help='video format code')
# --all-formats reuses dest='format' with sentinel value '-1'; the output
# template selection below special-cases that value.
2367 	video_format.add_option('--all-formats',
2368 			action='store_const', dest='format', help='download all available video formats', const='-1')
2369 	video_format.add_option('--max-quality',
2370 			action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
2371 	parser.add_option_group(video_format)
# The --get-* options all imply quiet simulation (see the FileDownloader
# 'quiet'/'simulate' parameters below).
2373 	verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
2374 	verbosity.add_option('-q', '--quiet',
2375 			action='store_true', dest='quiet', help='activates quiet mode', default=False)
2376 	verbosity.add_option('-s', '--simulate',
2377 			action='store_true', dest='simulate', help='do not download video', default=False)
2378 	verbosity.add_option('-g', '--get-url',
2379 			action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
2380 	verbosity.add_option('-e', '--get-title',
2381 			action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
2382 	verbosity.add_option('--get-thumbnail',
2383 			action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
2384 	verbosity.add_option('--get-description',
2385 			action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
2386 	verbosity.add_option('--no-progress',
2387 			action='store_true', dest='noprogress', help='do not print progress bar', default=False)
2388 	verbosity.add_option('--console-title',
2389 			action='store_true', dest='consoletitle', help='display progress in console titlebar', default=False)
2390 	parser.add_option_group(verbosity)
2392 	filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
2393 	filesystem.add_option('-t', '--title',
2394 			action='store_true', dest='usetitle', help='use title in file name', default=False)
2395 	filesystem.add_option('-l', '--literal',
2396 			action='store_true', dest='useliteral', help='use literal title in file name', default=False)
2397 	filesystem.add_option('-A', '--auto-number',
2398 			action='store_true', dest='autonumber', help='number downloaded files starting from 00000', default=False)
2399 	filesystem.add_option('-o', '--output',
2400 			dest='outtmpl', metavar='TEMPLATE', help='output filename template')
2401 	filesystem.add_option('-a', '--batch-file',
2402 			dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
2403 	filesystem.add_option('-w', '--no-overwrites',
2404 			action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
2405 	filesystem.add_option('-c', '--continue',
2406 			action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
2407 	filesystem.add_option('--cookies',
2408 			dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
2409 	filesystem.add_option('--no-part',
2410 			action='store_true', dest='nopart', help='do not use .part files', default=False)
2411 	parser.add_option_group(filesystem)
2413 	(opts, args) = parser.parse_args()
2413 (opts, args) = parser.parse_args()
2415 # Open appropriate CookieJar
2416 if opts.cookiefile is None:
2417 jar = cookielib.CookieJar()
2420 jar = cookielib.MozillaCookieJar(opts.cookiefile)
2421 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
2423 except (IOError, OSError), err:
2424 sys.exit(u'ERROR: unable to open cookie file')
2427 if opts.dump_user_agent:
2428 print std_headers['User-Agent']
2431 # General configuration
2432 cookie_processor = urllib2.HTTPCookieProcessor(jar)
2433 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()))
2434 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
2436 # Batch file verification
2438 if opts.batchfile is not None:
2440 if opts.batchfile == '-':
2443 batchfd = open(opts.batchfile, 'r')
2444 batchurls = batchfd.readlines()
2445 batchurls = [x.strip() for x in batchurls]
2446 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
2448 sys.exit(u'ERROR: batch file could not be read')
2449 all_urls = batchurls + args
2451 # Conflicting, missing and erroneous options
2452 if opts.usenetrc and (opts.username is not None or opts.password is not None):
2453 parser.error(u'using .netrc conflicts with giving username/password')
2454 if opts.password is not None and opts.username is None:
2455 parser.error(u'account username missing')
2456 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
2457 parser.error(u'using output template conflicts with using title, literal title or auto number')
2458 if opts.usetitle and opts.useliteral:
2459 parser.error(u'using title conflicts with using literal title')
2460 if opts.username is not None and opts.password is None:
2461 opts.password = getpass.getpass(u'Type account password and press return:')
2462 if opts.ratelimit is not None:
2463 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
2464 if numeric_limit is None:
2465 parser.error(u'invalid rate limit specified')
2466 opts.ratelimit = numeric_limit
2467 if opts.retries is not None:
2469 opts.retries = long(opts.retries)
2470 except (TypeError, ValueError), err:
2471 parser.error(u'invalid retry count specified')
2473 opts.playliststart = long(opts.playliststart)
2474 if opts.playliststart <= 0:
2476 except (TypeError, ValueError), err:
2477 parser.error(u'invalid playlist start number specified')
2479 opts.playlistend = long(opts.playlistend)
2480 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
2482 except (TypeError, ValueError), err:
2483 parser.error(u'invalid playlist end number specified')
2485 	# Information extractors
# Construct one instance of every extractor; the search/playlist/user IEs
# wrap a concrete video IE via the mutual-registration pattern.
2486 	youtube_ie = YoutubeIE()
2487 	metacafe_ie = MetacafeIE(youtube_ie)
2488 	dailymotion_ie = DailymotionIE()
2489 	youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
2490 	youtube_user_ie = YoutubeUserIE(youtube_ie)
2491 	youtube_search_ie = YoutubeSearchIE(youtube_ie)
2492 	google_ie = GoogleIE()
2493 	google_search_ie = GoogleSearchIE(google_ie)
2494 	photobucket_ie = PhotobucketIE()
2495 	yahoo_ie = YahooIE()
2496 	yahoo_search_ie = YahooSearchIE(yahoo_ie)
2497 	deposit_files_ie = DepositFilesIE()
2498 	generic_ie = GenericIE()
# Build the downloader. Any --get-* flag forces quiet + simulate.
2501 	fd = FileDownloader({
2502 		'usenetrc': opts.usenetrc,
2503 		'username': opts.username,
2504 		'password': opts.password,
2505 		'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
2506 		'forceurl': opts.geturl,
2507 		'forcetitle': opts.gettitle,
2508 		'forcethumbnail': opts.getthumbnail,
2509 		'forcedescription': opts.getdescription,
2510 		'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
2511 		'format': opts.format,
2512 		'format_limit': opts.format_limit,
# Output template resolution: an explicit -o wins; otherwise the first
# truthy branch of this or-chain picks a template matching the chosen
# combination of --all-formats ('-1'), -t/-l and -A, falling back to
# the bare '%(id)s.%(ext)s'.
2513 		'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
2514 			or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
2515 			or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
2516 			or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
2517 			or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
2518 			or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
2519 			or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
2520 			or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
2521 			or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
2522 			or u'%(id)s.%(ext)s'),
2523 		'ignoreerrors': opts.ignoreerrors,
2524 		'ratelimit': opts.ratelimit,
2525 		'nooverwrites': opts.nooverwrites,
2526 		'retries': opts.retries,
2527 		'continuedl': opts.continue_dl,
2528 		'noprogress': opts.noprogress,
2529 		'playliststart': opts.playliststart,
2530 		'playlistend': opts.playlistend,
# '-o -' writes the video to stdout, so screen messages go to stderr.
2531 		'logtostderr': opts.outtmpl == '-',
2532 		'consoletitle': opts.consoletitle,
2533 		'nopart': opts.nopart,
# Registration order is match order: search/playlist/user IEs must come
# before the plain site IEs whose URLs they would otherwise match.
2535 	fd.add_info_extractor(youtube_search_ie)
2536 	fd.add_info_extractor(youtube_pl_ie)
2537 	fd.add_info_extractor(youtube_user_ie)
2538 	fd.add_info_extractor(metacafe_ie)
2539 	fd.add_info_extractor(dailymotion_ie)
2540 	fd.add_info_extractor(youtube_ie)
2541 	fd.add_info_extractor(google_ie)
2542 	fd.add_info_extractor(google_search_ie)
2543 	fd.add_info_extractor(photobucket_ie)
2544 	fd.add_info_extractor(yahoo_ie)
2545 	fd.add_info_extractor(yahoo_search_ie)
2546 	fd.add_info_extractor(deposit_files_ie)
2548 	# This must come last since it's the
2549 	# fallback if none of the others work
2550 	fd.add_info_extractor(generic_ie)
# -U/--update overwrites the running script (sys.argv[0]) in place.
2553 	if opts.update_self:
2554 		update_self(fd, sys.argv[0])
2557 	if len(all_urls) < 1:
# With -U alone, having no URLs is fine; otherwise it's a usage error.
2558 		if not opts.update_self:
2559 			parser.error(u'you must provide at least one URL')
# download() returns the process exit code (non-zero on failures).
2562 	retcode = fd.download(all_urls)
2564 	# Dump cookie jar if requested
2565 	if opts.cookiefile is not None:
2568 		except (IOError, OSError), err:
2569 			sys.exit(u'ERROR: unable to save cookie jar')
2573 	except DownloadError:
2575 	except SameFileError:
2576 		sys.exit(u'ERROR: fixed output name but more than one file to download')
2577 	except KeyboardInterrupt:
2578 		sys.exit(u'\nERROR: Interrupted by user')