2 # -*- coding: utf-8 -*-
5 'Ricardo Garcia Gonzalez',
13 'Philipp Hagemeister',
20 __license__ = 'Public Domain'
21 __version__ = '2012.01.05'
23 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
52 except ImportError: # Python 2.4
55 import cStringIO as StringIO
59 # parse_qs was moved from the cgi module to the urlparse module recently.
61 from urlparse import parse_qs
63 from cgi import parse_qs
71 import xml.etree.ElementTree
72 except ImportError: # Python<2.5: Not officially supported, but let it slip
73 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
76 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
77 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
78 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
79 'Accept-Encoding': 'gzip, deflate',
80 'Accept-Language': 'en-us,en;q=0.5',
85 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
91 def raiseError(msg, i):
92 raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
93 def skipSpace(i, expectMore=True):
94 while i < len(s) and s[i] in ' \t\r\n':
98 raiseError('Premature end', i)
100 def decodeEscape(match):
116 return unichr(int(esc[1:5], 16))
117 if len(esc) == 5+6 and esc[5:7] == '\\u':
118 hi = int(esc[1:5], 16)
119 low = int(esc[7:11], 16)
120 return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
121 raise ValueError('Unknown escape ' + str(esc))
128 while s[e-bslashes-1] == '\\':
130 if bslashes % 2 == 1:
134 rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
135 stri = rexp.sub(decodeEscape, s[i:e])
141 if s[i] == '}': # Empty dictionary
145 raiseError('Expected a string object key', i)
146 i,key = parseString(i)
148 if i >= len(s) or s[i] != ':':
149 raiseError('Expected a colon', i)
156 raiseError('Expected comma or closing curly brace', i)
161 if s[i] == ']': # Empty array
166 i = skipSpace(i) # Raise exception if premature end
170 raiseError('Expected a comma or closing bracket', i)
172 def parseDiscrete(i):
173 for k,v in {'true': True, 'false': False, 'null': None}.items():
174 if s.startswith(k, i):
176 raiseError('Not a boolean (or null)', i)
178 mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
180 raiseError('Not a number', i)
182 if '.' in nums or 'e' in nums or 'E' in nums:
183 return (i+len(nums), float(nums))
184 return (i+len(nums), int(nums))
185 CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
188 i,res = CHARMAP.get(s[i], parseNumber)(i)
189 i = skipSpace(i, False)
193 raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
196 def preferredencoding():
197 """Get preferred encoding.
199 Returns the best encoding scheme for the system, based on
200 locale.getpreferredencoding() and some further tweaks.
202 def yield_preferredencoding():
204 pref = locale.getpreferredencoding()
210 return yield_preferredencoding().next()
def htmlentity_transform(matchobj):
    """Transform an HTML entity into the Unicode character it denotes.

    This function receives a match object and is intended to be used as the
    replacement callback of re.sub(). Group 1 of the match must hold the
    entity body, i.e. the text between '&' and ';'.

    Named entities are looked up in htmlentitydefs.name2codepoint. Numeric
    entities may be decimal ('#233') or hexadecimal ('#xe9'). Anything else
    is returned unchanged in its literal '&...;' form.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in htmlentitydefs.name2codepoint:
        return unichr(htmlentitydefs.name2codepoint[entity])

    # Numeric character reference. Hexadecimal references need the a-f
    # digits, and the pattern is anchored at the end so that a body such as
    # '#x4F' or '#12ab' is never decoded from its leading digits alone.
    mobj = re.match(u'(?u)#(x[0-9a-fA-F]+|[0-9]+)$', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            base = 16
            # long() accepts the '0x' prefix when base is 16
            numstr = u'0%s' % numstr
        else:
            base = 10
        return unichr(long(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
240 def sanitize_title(utitle):
241 """Sanitizes a video title so it could be used as part of a filename."""
242 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
243 return utitle.replace(unicode(os.sep), u'%')
246 def sanitize_open(filename, open_mode):
247 """Try to open the given filename, and slightly tweak it if this fails.
249 Attempts to open the given filename. If this fails, it tries to change
250 the filename slightly, step by step, until it's either able to open it
251 or it fails and raises a final exception, like the standard open()
254 It returns the tuple (stream, definitive_file_name).
258 if sys.platform == 'win32':
260 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
261 return (sys.stdout, filename)
262 stream = open(filename, open_mode)
263 return (stream, filename)
264 except (IOError, OSError), err:
265 # In case of error, try to remove win32 forbidden chars
266 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
268 # An exception here should be caught in the caller
269 stream = open(filename, open_mode)
270 return (stream, filename)
273 def timeconvert(timestr):
274 """Convert RFC 2822 defined time string into system timestamp"""
276 timetuple = email.utils.parsedate_tz(timestr)
277 if timetuple is not None:
278 timestamp = email.utils.mktime_tz(timetuple)
281 def _simplify_title(title):
282 expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
283 return expr.sub(u'_', title).strip(u'_')
285 def _orderedSet(iterable):
286 """ Remove all duplicates from the input iterable """
293 def _unescapeHTML(s):
295 @param s a string (of type unicode)
297 assert type(s) == type(u'')
299 htmlParser = HTMLParser.HTMLParser()
300 return htmlParser.unescape(s)
302 class DownloadError(Exception):
303 """Download Error exception.
305 This exception may be thrown by FileDownloader objects if they are not
306 configured to continue on errors. They will contain the appropriate
312 class SameFileError(Exception):
313 """Same File exception.
315 This exception will be thrown by FileDownloader objects if they detect
316 multiple files would have to be downloaded to the same file on disk.
321 class PostProcessingError(Exception):
322 """Post Processing exception.
324 This exception may be raised by PostProcessor's .run() method to
325 indicate an error in the postprocessing task.
329 class MaxDownloadsReached(Exception):
330 """ --max-downloads limit has been reached. """
334 class UnavailableVideoError(Exception):
335 """Unavailable Format exception.
337 This exception will be thrown when a video is requested
338 in a format that is not available for that video.
343 class ContentTooShortError(Exception):
344 """Content Too Short exception.
346 This exception may be raised by FileDownloader objects when a file they
347 download is too small for what the server announced first, indicating
348 the connection was probably interrupted.
def __init__(self, downloaded, expected):
    """Store both values on the exception instance.

    downloaded -- kept as self.downloaded
    expected   -- kept as self.expected
    """
    self.expected = expected
    self.downloaded = downloaded
359 class YoutubeDLHandler(urllib2.HTTPHandler):
360 """Handler for HTTP requests and responses.
362 This class, when installed with an OpenerDirector, automatically adds
363 the standard headers to every HTTP request and handles gzipped and
364 deflated responses from web servers. If compression is to be avoided in
365 a particular request, the original request in the program code only has
366 to include the HTTP header "Youtubedl-No-Compression", which will be
367 removed before making the real request.
369 Part of this code was copied from:
371 http://techknack.net/python-urllib2-handlers/
373 Andrew Rowls, the author of that code, agreed to release it to the
380 return zlib.decompress(data, -zlib.MAX_WBITS)
382 return zlib.decompress(data)
385 def addinfourl_wrapper(stream, headers, url, code):
386 if hasattr(urllib2.addinfourl, 'getcode'):
387 return urllib2.addinfourl(stream, headers, url, code)
388 ret = urllib2.addinfourl(stream, headers, url)
392 def http_request(self, req):
393 for h in std_headers:
396 req.add_header(h, std_headers[h])
397 if 'Youtubedl-no-compression' in req.headers:
398 if 'Accept-encoding' in req.headers:
399 del req.headers['Accept-encoding']
400 del req.headers['Youtubedl-no-compression']
403 def http_response(self, req, resp):
406 if resp.headers.get('Content-encoding', '') == 'gzip':
407 gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
408 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
409 resp.msg = old_resp.msg
411 if resp.headers.get('Content-encoding', '') == 'deflate':
412 gz = StringIO.StringIO(self.deflate(resp.read()))
413 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
414 resp.msg = old_resp.msg
418 class FileDownloader(object):
419 """File Downloader class.
421 File downloader objects are the ones responsible for downloading the
422 actual video file and writing it to disk if the user has requested
423 it, among some other tasks. In most cases there should be one per
424 program. As, given a video URL, the downloader doesn't know how to
425 extract all the needed information, task that InfoExtractors do, it
426 has to pass the URL to one of them.
428 For this, file downloader objects have a method that allows
429 InfoExtractors to be registered in a given order. When it is passed
430 a URL, the file downloader hands it to the first InfoExtractor it
431 finds that reports being able to handle it. The InfoExtractor extracts
432 all the information about the video or videos the URL refers to, and
433 asks the FileDownloader to process the video information, possibly
434 downloading the video.
436 File downloaders accept a lot of parameters. In order not to saturate
437 the object constructor with arguments, it receives a dictionary of
438 options instead. These options are available through the params
439 attribute for the InfoExtractors to use. The FileDownloader also
440 registers itself as the downloader in charge for the InfoExtractors
441 that are added to it, so this is a "mutual registration".
445 username: Username for authentication purposes.
446 password: Password for authentication purposes.
447 usenetrc: Use netrc for authentication instead.
448 quiet: Do not print messages to stdout.
449 forceurl: Force printing final URL.
450 forcetitle: Force printing title.
451 forcethumbnail: Force printing thumbnail URL.
452 forcedescription: Force printing description.
453 forcefilename: Force printing final filename.
454 simulate: Do not download the video files.
455 format: Video format code.
456 format_limit: Highest quality format to try.
457 outtmpl: Template for output names.
458 ignoreerrors: Do not stop on download errors.
459 ratelimit: Download speed limit, in bytes/sec.
460 nooverwrites: Prevent overwriting files.
461 retries: Number of times to retry for HTTP error 5xx
462 continuedl: Try to continue downloads if possible.
463 noprogress: Do not print the progress bar.
464 playliststart: Playlist item to start at.
465 playlistend: Playlist item to end at.
466 matchtitle: Download only matching titles.
467 rejecttitle: Reject downloads for matching titles.
468 logtostderr: Log messages to stderr instead of stdout.
469 consoletitle: Display progress in console window's titlebar.
470 nopart: Do not use temporary .part files.
471 updatetime: Use the Last-modified header to set output file timestamps.
472 writedescription: Write the video description to a .description file
473 writeinfojson: Write the video metadata to a .info.json file
479 _download_retcode = None
480 _num_downloads = None
483 def __init__(self, params):
484 """Create a FileDownloader object with the given options."""
487 self._download_retcode = 0
488 self._num_downloads = 0
489 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
493 def format_bytes(bytes):
496 if type(bytes) is str:
501 exponent = long(math.log(bytes, 1024.0))
502 suffix = 'bkMGTPEZY'[exponent]
503 converted = float(bytes) / float(1024 ** exponent)
504 return '%.2f%s' % (converted, suffix)
507 def calc_percent(byte_counter, data_len):
510 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
513 def calc_eta(start, now, total, current):
517 if current == 0 or dif < 0.001: # One millisecond
519 rate = float(current) / dif
520 eta = long((float(total) - float(current)) / rate)
521 (eta_mins, eta_secs) = divmod(eta, 60)
524 return '%02d:%02d' % (eta_mins, eta_secs)
527 def calc_speed(start, now, bytes):
529 if bytes == 0 or dif < 0.001: # One millisecond
530 return '%10s' % '---b/s'
531 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
534 def best_block_size(elapsed_time, bytes):
535 new_min = max(bytes / 2.0, 1.0)
536 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
537 if elapsed_time < 0.001:
539 rate = bytes / elapsed_time
547 def parse_bytes(bytestr):
548 """Parse a string indicating a byte quantity into a long integer."""
549 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
552 number = float(matchobj.group(1))
553 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
554 return long(round(number * multiplier))
556 def add_info_extractor(self, ie):
557 """Add an InfoExtractor object to the end of the list."""
559 ie.set_downloader(self)
561 def add_post_processor(self, pp):
562 """Add a PostProcessor object to the end of the chain."""
564 pp.set_downloader(self)
566 def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
567 """Print message to stdout if not in quiet mode."""
569 if not self.params.get('quiet', False):
570 terminator = [u'\n', u''][skip_eol]
571 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
572 self._screen_file.flush()
573 except (UnicodeEncodeError), err:
574 if not ignore_encoding_errors:
def to_stderr(self, message):
    """Encode message with the preferred encoding and print it to stderr."""
    encoded = message.encode(preferredencoding())
    print >>sys.stderr, encoded
581 def to_cons_title(self, message):
582 """Set console/terminal window title to message."""
583 if not self.params.get('consoletitle', False):
585 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
586 # c_wchar_p() might not be necessary if `message` is
587 # already of type unicode()
588 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
589 elif 'TERM' in os.environ:
590 sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
592 def fixed_template(self):
593 """Checks if the output template is fixed."""
594 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
def trouble(self, message=None):
    """React to a download problem.

    The message, when given, is printed to stderr first. Unless the
    'ignoreerrors' parameter is set, a DownloadError carrying the message
    is then raised. Otherwise the downloader's return code is set to 1.
    """
    if message is not None:
        self.to_stderr(message)
    ignore_errors = self.params.get('ignoreerrors', False)
    if ignore_errors:
        self._download_retcode = 1
        return
    raise DownloadError(message)
609 def slow_down(self, start_time, byte_counter):
610 """Sleep if the download speed is over the rate limit."""
611 rate_limit = self.params.get('ratelimit', None)
612 if rate_limit is None or byte_counter == 0:
615 elapsed = now - start_time
618 speed = float(byte_counter) / elapsed
619 if speed > rate_limit:
620 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
622 def temp_name(self, filename):
623 """Returns a temporary filename for the given filename."""
624 if self.params.get('nopart', False) or filename == u'-' or \
625 (os.path.exists(filename) and not os.path.isfile(filename)):
627 return filename + u'.part'
629 def undo_temp_name(self, filename):
630 if filename.endswith(u'.part'):
631 return filename[:-len(u'.part')]
634 def try_rename(self, old_filename, new_filename):
636 if old_filename == new_filename:
638 os.rename(old_filename, new_filename)
639 except (IOError, OSError), err:
640 self.trouble(u'ERROR: unable to rename file')
642 def try_utime(self, filename, last_modified_hdr):
643 """Try to set the last-modified time of the given file."""
644 if last_modified_hdr is None:
646 if not os.path.isfile(filename):
648 timestr = last_modified_hdr
651 filetime = timeconvert(timestr)
655 os.utime(filename, (time.time(), filetime))
def report_writedescription(self, descfn):
    """Announce the file the video description is being written to."""
    notice = u'[info] Writing video description to: %s' % descfn
    self.to_screen(notice, ignore_encoding_errors=True)
def report_writeinfojson(self, infofn):
    """Report that the JSON metadata file is being written.

    infofn -- name of the .info.json file, interpolated into the message.

    The message now starts with 'Writing', matching the wording of
    report_writedescription(); the previous text had no verb.
    """
    self.to_screen(u'[info] Writing video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
def report_destination(self, filename):
    """Announce the destination filename of the download."""
    notice = u'[download] Destination: %s' % filename
    self.to_screen(notice, ignore_encoding_errors=True)
672 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
673 """Report download progress."""
674 if self.params.get('noprogress', False):
676 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
677 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
678 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
679 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
def report_resuming_byte(self, resume_len):
    """Announce the byte offset at which a resumed download starts."""
    notice = u'[download] Resuming download at byte %s' % resume_len
    self.to_screen(notice)
def report_retry(self, count, retries):
    """Announce a retry after a server HTTP error.

    count   -- number of the current attempt
    retries -- total number of attempts shown in the message
    """
    attempt = (count, retries)
    self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % attempt)
689 def report_file_already_downloaded(self, file_name):
690 """Report file has already been fully downloaded."""
692 self.to_screen(u'[download] %s has already been downloaded' % file_name)
693 except (UnicodeEncodeError), err:
694 self.to_screen(u'[download] The file has already been downloaded')
def report_unable_to_resume(self):
    """Announce that the download could not be resumed."""
    notice = u'[download] Unable to resume'
    self.to_screen(notice)
700 def report_finish(self):
701 """Report download finished."""
702 if self.params.get('noprogress', False):
703 self.to_screen(u'[download] Download completed')
def increment_downloads(self):
    """Advance by one the ordinal that numbers each downloaded file."""
    self._num_downloads = self._num_downloads + 1
711 def prepare_filename(self, info_dict):
712 """Generate the output filename."""
714 template_dict = dict(info_dict)
715 template_dict['epoch'] = unicode(long(time.time()))
716 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
717 filename = self.params['outtmpl'] % template_dict
719 except (ValueError, KeyError), err:
720 self.trouble(u'ERROR: invalid system charset or erroneous output template')
def _match_entry(self, info_dict):
    """Decide whether a video passes the title filters.

    Returns None iff the file should be downloaded. Otherwise returns a
    unicode string naming the reason it is skipped.

    The 'matchtitle' and 'rejecttitle' parameters are regular expressions
    searched case-insensitively in info_dict['title'].

    The reason carries no '[download] ' prefix: process_info() prepends
    one when printing it, so including it here printed the prefix twice
    for the 'matchtitle' case only.
    """
    title = info_dict['title']
    matchtitle = self.params.get('matchtitle', False)
    if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
        return u'"' + title + '" title did not match pattern "' + matchtitle + '"'
    rejecttitle = self.params.get('rejecttitle', False)
    if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
        return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
    return None
735 def process_info(self, info_dict):
736 """Process a single dictionary returned by an InfoExtractor."""
738 reason = self._match_entry(info_dict)
739 if reason is not None:
740 self.to_screen(u'[download] ' + reason)
743 max_downloads = self.params.get('max_downloads')
744 if max_downloads is not None:
745 if self._num_downloads > int(max_downloads):
746 raise MaxDownloadsReached()
748 filename = self.prepare_filename(info_dict)
751 if self.params.get('forcetitle', False):
752 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
753 if self.params.get('forceurl', False):
754 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
755 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
756 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
757 if self.params.get('forcedescription', False) and 'description' in info_dict:
758 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
759 if self.params.get('forcefilename', False) and filename is not None:
760 print filename.encode(preferredencoding(), 'xmlcharrefreplace')
761 if self.params.get('forceformat', False):
762 print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')
764 # Do nothing else if in simulate mode
765 if self.params.get('simulate', False):
772 dn = os.path.dirname(filename)
773 if dn != '' and not os.path.exists(dn):
775 except (OSError, IOError), err:
776 self.trouble(u'ERROR: unable to create directory ' + unicode(err))
779 if self.params.get('writedescription', False):
781 descfn = filename + '.description'
782 self.report_writedescription(descfn)
783 descfile = open(descfn, 'wb')
785 descfile.write(info_dict['description'].encode('utf-8'))
788 except (OSError, IOError):
789 self.trouble(u'ERROR: Cannot write description file ' + descfn)
792 if self.params.get('writeinfojson', False):
793 infofn = filename + '.info.json'
794 self.report_writeinfojson(infofn)
797 except (NameError,AttributeError):
798 self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
801 infof = open(infofn, 'wb')
803 json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
804 json.dump(json_info_dict, infof)
807 except (OSError, IOError):
808 self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
811 if not self.params.get('skip_download', False):
812 if self.params.get('nooverwrites', False) and os.path.exists(filename):
816 success = self._do_download(filename, info_dict)
817 except (OSError, IOError), err:
818 raise UnavailableVideoError
819 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
820 self.trouble(u'ERROR: unable to download video data: %s' % str(err))
822 except (ContentTooShortError, ), err:
823 self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
828 self.post_process(filename, info_dict)
829 except (PostProcessingError), err:
830 self.trouble(u'ERROR: postprocessing: %s' % str(err))
833 def download(self, url_list):
834 """Download a given list of URLs."""
835 if len(url_list) > 1 and self.fixed_template():
836 raise SameFileError(self.params['outtmpl'])
839 suitable_found = False
841 # Go to next InfoExtractor if not suitable
842 if not ie.suitable(url):
845 # Suitable InfoExtractor found
846 suitable_found = True
848 # Extract information from URL and process it
851 # Suitable InfoExtractor had been found; go to next URL
854 if not suitable_found:
855 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
857 return self._download_retcode
859 def post_process(self, filename, ie_info):
860 """Run the postprocessing chain on the given file."""
862 info['filepath'] = filename
868 def _download_with_rtmpdump(self, filename, url, player_url):
869 self.report_destination(filename)
870 tmpfilename = self.temp_name(filename)
872 # Check for rtmpdump first
874 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
875 except (OSError, IOError):
876 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
879 # Download using rtmpdump. rtmpdump returns exit code 2 when
880 # the connection was interrupted and resuming appears to be
881 # possible. This is part of rtmpdump's normal usage, AFAIK.
882 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
883 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
884 while retval == 2 or retval == 1:
885 prevsize = os.path.getsize(tmpfilename)
886 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
887 time.sleep(5.0) # This seems to be needed
888 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
889 cursize = os.path.getsize(tmpfilename)
890 if prevsize == cursize and retval == 1:
892 # Some rtmp streams seem to abort after ~ 99.8%. Don't complain for those
893 if prevsize == cursize and retval == 2 and cursize > 1024:
894 self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
898 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
899 self.try_rename(tmpfilename, filename)
902 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
905 def _do_download(self, filename, info_dict):
906 url = info_dict['url']
907 player_url = info_dict.get('player_url', None)
909 # Check file already present
910 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
911 self.report_file_already_downloaded(filename)
914 # Attempt to download using rtmpdump
915 if url.startswith('rtmp'):
916 return self._download_with_rtmpdump(filename, url, player_url)
918 tmpfilename = self.temp_name(filename)
921 # Do not include the Accept-Encoding header
922 headers = {'Youtubedl-no-compression': 'True'}
923 basic_request = urllib2.Request(url, None, headers)
924 request = urllib2.Request(url, None, headers)
926 # Establish possible resume length
927 if os.path.isfile(tmpfilename):
928 resume_len = os.path.getsize(tmpfilename)
934 if self.params.get('continuedl', False):
935 self.report_resuming_byte(resume_len)
936 request.add_header('Range','bytes=%d-' % resume_len)
942 retries = self.params.get('retries', 0)
943 while count <= retries:
944 # Establish connection
946 if count == 0 and 'urlhandle' in info_dict:
947 data = info_dict['urlhandle']
948 data = urllib2.urlopen(request)
950 except (urllib2.HTTPError, ), err:
951 if (err.code < 500 or err.code >= 600) and err.code != 416:
952 # Unexpected HTTP error
954 elif err.code == 416:
955 # Unable to resume (requested range not satisfiable)
957 # Open the connection again without the range header
958 data = urllib2.urlopen(basic_request)
959 content_length = data.info()['Content-Length']
960 except (urllib2.HTTPError, ), err:
961 if err.code < 500 or err.code >= 600:
964 # Examine the reported length
965 if (content_length is not None and
966 (resume_len - 100 < long(content_length) < resume_len + 100)):
967 # The file had already been fully downloaded.
968 # Explanation to the above condition: in issue #175 it was revealed that
969 # YouTube sometimes adds or removes a few bytes from the end of the file,
970 # changing the file size slightly and causing problems for some users. So
971 # I decided to implement a suggested change and consider the file
972 # completely downloaded if the file size differs less than 100 bytes from
973 # the one in the hard drive.
974 self.report_file_already_downloaded(filename)
975 self.try_rename(tmpfilename, filename)
978 # The length does not match, we start the download over
979 self.report_unable_to_resume()
985 self.report_retry(count, retries)
988 self.trouble(u'ERROR: giving up after %s retries' % retries)
991 data_len = data.info().get('Content-length', None)
992 if data_len is not None:
993 data_len = long(data_len) + resume_len
994 data_len_str = self.format_bytes(data_len)
995 byte_counter = 0 + resume_len
1000 before = time.time()
1001 data_block = data.read(block_size)
1003 if len(data_block) == 0:
1005 byte_counter += len(data_block)
1007 # Open file just in time
1010 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
1011 assert stream is not None
1012 filename = self.undo_temp_name(tmpfilename)
1013 self.report_destination(filename)
1014 except (OSError, IOError), err:
1015 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
1018 stream.write(data_block)
1019 except (IOError, OSError), err:
1020 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
1022 block_size = self.best_block_size(after - before, len(data_block))
1025 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
1026 if data_len is None:
1027 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
1029 percent_str = self.calc_percent(byte_counter, data_len)
1030 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
1031 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
1034 self.slow_down(start, byte_counter - resume_len)
1037 self.trouble(u'\nERROR: Did not get any data blocks')
1040 self.report_finish()
1041 if data_len is not None and byte_counter != data_len:
1042 raise ContentTooShortError(byte_counter, long(data_len))
1043 self.try_rename(tmpfilename, filename)
1045 # Update file modification time
1046 if self.params.get('updatetime', True):
1047 info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
1052 class InfoExtractor(object):
1053 """Information Extractor class.
1055 Information extractors are the classes that, given a URL, extract
1056 information from the video (or videos) the URL refers to. This
1057 information includes the real video URL, the video title and simplified
1058 title, author and others. The information is stored in a dictionary
1059 which is then passed to the FileDownloader. The FileDownloader
1060 processes this information possibly downloading the video to the file
1061 system, among other possible outcomes. The dictionaries must include
1062 the following fields:
1064 id: Video identifier.
1065 url: Final video URL.
1066 uploader: Nickname of the video uploader.
1067 title: Literal title.
1068 stitle: Simplified title.
1069 ext: Video filename extension.
1070 format: Video format.
1071 player_url: SWF Player URL (may be None).
1073 The following fields are optional. Their primary purpose is to allow
1074 youtube-dl to serve as the backend for a video search function, such
1075 as the one in youtube2mp3. They are only used when their respective
1076 forced printing functions are called:
1078 thumbnail: Full URL to a video thumbnail image.
1079 description: One-line video description.
1081 Subclasses of this one should re-define the _real_initialize() and
1082 _real_extract() methods and define a _VALID_URL regexp.
1083 Probably, they should also be added to the list of extractors.
1089 def __init__(self, downloader=None):
1090 """Constructor. Receives an optional downloader."""
1092 self.set_downloader(downloader)
def suitable(self, url):
    """Tell whether this IE can handle the given URL.

    Returns True when self._VALID_URL matches at the start of url.
    """
    match = re.match(self._VALID_URL, url)
    return match is not None
1098 def initialize(self):
1099 """Initializes an instance (authentication, etc)."""
1101 self._real_initialize()
1104 def extract(self, url):
1105 """Extracts URL information and returns it in list of dicts."""
1107 return self._real_extract(url)
def set_downloader(self, downloader):
    """Attach a downloader to this IE.

    The object is stored as self._downloader; None is accepted.
    """
    self._downloader = downloader
1113 def _real_initialize(self):
1114 """Real initialization process. Redefine in subclasses."""
1117 def _real_extract(self, url):
1118 """Real extraction process. Redefine in subclasses."""
1122 class YoutubeIE(InfoExtractor):
1123 """Information extractor for youtube.com."""
1125 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1126 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1127 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1128 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1129 _NETRC_MACHINE = 'youtube'
1130 # Listed in order of quality
1131 _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
1132 _available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
1133 _video_extensions = {
1139 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1144 _video_dimensions = {
1159 IE_NAME = u'youtube'
1161 def report_lang(self):
1162 """Report attempt to set language."""
1163 self._downloader.to_screen(u'[youtube] Setting language')
1165 def report_login(self):
1166 """Report attempt to log in."""
1167 self._downloader.to_screen(u'[youtube] Logging in')
1169 def report_age_confirmation(self):
1170 """Report attempt to confirm age."""
1171 self._downloader.to_screen(u'[youtube] Confirming age')
1173 def report_video_webpage_download(self, video_id):
1174 """Report attempt to download video webpage."""
1175 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1177 def report_video_info_webpage_download(self, video_id):
1178 """Report attempt to download video info webpage."""
1179 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1181 def report_information_extraction(self, video_id):
1182 """Report attempt to extract video information."""
1183 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1185 def report_unavailable_format(self, video_id, format):
1186 """Report extracted video URL."""
1187 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1189 def report_rtmp_download(self):
1190 """Indicate the download will use the RTMP protocol."""
1191 self._downloader.to_screen(u'[youtube] RTMP download detected')
1193 def _print_formats(self, formats):
1194 print 'Available formats:'
1196 print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
1198 def _real_initialize(self):
1199 if self._downloader is None:
1204 downloader_params = self._downloader.params
1206 # Attempt to use provided username and password or .netrc data
1207 if downloader_params.get('username', None) is not None:
1208 username = downloader_params['username']
1209 password = downloader_params['password']
1210 elif downloader_params.get('usenetrc', False):
1212 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1213 if info is not None:
1217 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1218 except (IOError, netrc.NetrcParseError), err:
1219 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1223 request = urllib2.Request(self._LANG_URL)
1226 urllib2.urlopen(request).read()
1227 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1228 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1231 # No authentication to be performed
1232 if username is None:
1237 'current_form': 'loginForm',
1239 'action_login': 'Log In',
1240 'username': username,
1241 'password': password,
1243 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1246 login_results = urllib2.urlopen(request).read()
1247 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1248 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1250 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1251 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1257 'action_confirm': 'Confirm',
1259 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1261 self.report_age_confirmation()
1262 age_results = urllib2.urlopen(request).read()
1263 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1264 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1267 def _real_extract(self, url):
1268 # Extract video id from URL
1269 mobj = re.match(self._VALID_URL, url)
1271 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1273 video_id = mobj.group(2)
1276 self.report_video_webpage_download(video_id)
1277 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1279 video_webpage = urllib2.urlopen(request).read()
1280 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1281 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1284 # Attempt to extract SWF player URL
1285 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1286 if mobj is not None:
1287 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1292 self.report_video_info_webpage_download(video_id)
1293 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1294 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1295 % (video_id, el_type))
1296 request = urllib2.Request(video_info_url)
1298 video_info_webpage = urllib2.urlopen(request).read()
1299 video_info = parse_qs(video_info_webpage)
1300 if 'token' in video_info:
1302 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1303 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1305 if 'token' not in video_info:
1306 if 'reason' in video_info:
1307 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1309 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1312 # Start extracting information
1313 self.report_information_extraction(video_id)
1316 if 'author' not in video_info:
1317 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1319 video_uploader = urllib.unquote_plus(video_info['author'][0])
1322 if 'title' not in video_info:
1323 self._downloader.trouble(u'ERROR: unable to extract video title')
1325 video_title = urllib.unquote_plus(video_info['title'][0])
1326 video_title = video_title.decode('utf-8')
1327 video_title = sanitize_title(video_title)
1330 simple_title = _simplify_title(video_title)
1333 if 'thumbnail_url' not in video_info:
1334 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1335 video_thumbnail = ''
1336 else: # don't panic if we can't find it
1337 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1341 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1342 if mobj is not None:
1343 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1344 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1345 for expression in format_expressions:
1347 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1355 video_description = u'No description available.'
1356 if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1357 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1358 if mobj is not None:
1359 video_description = mobj.group(1).decode('utf-8')
1361 html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1362 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1363 video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1364 # TODO use another parser
1367 video_token = urllib.unquote_plus(video_info['token'][0])
1369 # Decide which formats to download
1370 req_format = self._downloader.params.get('format', None)
1372 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1373 self.report_rtmp_download()
1374 video_url_list = [(None, video_info['conn'][0])]
1375 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1376 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1377 url_data = [parse_qs(uds) for uds in url_data_strs]
1378 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1379 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1381 format_limit = self._downloader.params.get('format_limit', None)
1382 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
1383 if format_limit is not None and format_limit in available_formats:
1384 format_list = available_formats[available_formats.index(format_limit):]
1386 format_list = available_formats
1387 existing_formats = [x for x in format_list if x in url_map]
1388 if len(existing_formats) == 0:
1389 self._downloader.trouble(u'ERROR: no known formats available for video')
1391 if self._downloader.params.get('listformats', None):
1392 self._print_formats(existing_formats)
1394 if req_format is None or req_format == 'best':
1395 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1396 elif req_format == 'worst':
1397 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1398 elif req_format in ('-1', 'all'):
1399 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1401 # Specific formats. We pick the first in a slash-delimeted sequence.
1402 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1403 req_formats = req_format.split('/')
1404 video_url_list = None
1405 for rf in req_formats:
1407 video_url_list = [(rf, url_map[rf])]
1409 if video_url_list is None:
1410 self._downloader.trouble(u'ERROR: requested format not available')
1413 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1416 for format_param, video_real_url in video_url_list:
1417 # At this point we have a new video
1418 self._downloader.increment_downloads()
1421 video_extension = self._video_extensions.get(format_param, 'flv')
1424 # Process video information
1425 self._downloader.process_info({
1426 'id': video_id.decode('utf-8'),
1427 'url': video_real_url.decode('utf-8'),
1428 'uploader': video_uploader.decode('utf-8'),
1429 'upload_date': upload_date,
1430 'title': video_title,
1431 'stitle': simple_title,
1432 'ext': video_extension.decode('utf-8'),
1433 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1434 'thumbnail': video_thumbnail.decode('utf-8'),
1435 'description': video_description,
1436 'player_url': player_url,
1438 except UnavailableVideoError, err:
1439 self._downloader.trouble(u'\nERROR: unable to download video')
1442 class MetacafeIE(InfoExtractor):
1443 """Information Extractor for metacafe.com."""
1445 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1446 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1447 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1449 IE_NAME = u'metacafe'
1451 def __init__(self, youtube_ie, downloader=None):
1452 InfoExtractor.__init__(self, downloader)
1453 self._youtube_ie = youtube_ie
1455 def report_disclaimer(self):
1456 """Report disclaimer retrieval."""
1457 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1459 def report_age_confirmation(self):
1460 """Report attempt to confirm age."""
1461 self._downloader.to_screen(u'[metacafe] Confirming age')
1463 def report_download_webpage(self, video_id):
1464 """Report webpage download."""
1465 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1467 def report_extraction(self, video_id):
1468 """Report information extraction."""
1469 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1471 def _real_initialize(self):
1472 # Retrieve disclaimer
1473 request = urllib2.Request(self._DISCLAIMER)
1475 self.report_disclaimer()
1476 disclaimer = urllib2.urlopen(request).read()
1477 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1478 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1484 'submit': "Continue - I'm over 18",
1486 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1488 self.report_age_confirmation()
1489 disclaimer = urllib2.urlopen(request).read()
1490 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1491 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1494 def _real_extract(self, url):
1495 # Extract id and simplified title from URL
1496 mobj = re.match(self._VALID_URL, url)
1498 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1501 video_id = mobj.group(1)
1503 # Check if video comes from YouTube
1504 mobj2 = re.match(r'^yt-(.*)$', video_id)
1505 if mobj2 is not None:
1506 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1509 # At this point we have a new video
1510 self._downloader.increment_downloads()
1512 simple_title = mobj.group(2).decode('utf-8')
1514 # Retrieve video webpage to extract further information
1515 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1517 self.report_download_webpage(video_id)
1518 webpage = urllib2.urlopen(request).read()
1519 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1520 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1523 # Extract URL, uploader and title from webpage
1524 self.report_extraction(video_id)
1525 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1526 if mobj is not None:
1527 mediaURL = urllib.unquote(mobj.group(1))
1528 video_extension = mediaURL[-3:]
1530 # Extract gdaKey if available
1531 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1533 video_url = mediaURL
1535 gdaKey = mobj.group(1)
1536 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1538 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1540 self._downloader.trouble(u'ERROR: unable to extract media URL')
1542 vardict = parse_qs(mobj.group(1))
1543 if 'mediaData' not in vardict:
1544 self._downloader.trouble(u'ERROR: unable to extract media URL')
1546 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1548 self._downloader.trouble(u'ERROR: unable to extract media URL')
1550 mediaURL = mobj.group(1).replace('\\/', '/')
1551 video_extension = mediaURL[-3:]
1552 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1554 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1556 self._downloader.trouble(u'ERROR: unable to extract title')
1558 video_title = mobj.group(1).decode('utf-8')
1559 video_title = sanitize_title(video_title)
1561 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1563 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1565 video_uploader = mobj.group(1)
1568 # Process video information
1569 self._downloader.process_info({
1570 'id': video_id.decode('utf-8'),
1571 'url': video_url.decode('utf-8'),
1572 'uploader': video_uploader.decode('utf-8'),
1573 'upload_date': u'NA',
1574 'title': video_title,
1575 'stitle': simple_title,
1576 'ext': video_extension.decode('utf-8'),
1580 except UnavailableVideoError:
1581 self._downloader.trouble(u'\nERROR: unable to download video')
1584 class DailymotionIE(InfoExtractor):
1585 """Information Extractor for Dailymotion"""
1587 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1588 IE_NAME = u'dailymotion'
1590 def __init__(self, downloader=None):
1591 InfoExtractor.__init__(self, downloader)
1593 def report_download_webpage(self, video_id):
1594 """Report webpage download."""
1595 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1597 def report_extraction(self, video_id):
1598 """Report information extraction."""
1599 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1601 def _real_extract(self, url):
1602 # Extract id and simplified title from URL
1603 mobj = re.match(self._VALID_URL, url)
1605 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1608 # At this point we have a new video
1609 self._downloader.increment_downloads()
1610 video_id = mobj.group(1)
1612 video_extension = 'flv'
1614 # Retrieve video webpage to extract further information
1615 request = urllib2.Request(url)
1616 request.add_header('Cookie', 'family_filter=off')
1618 self.report_download_webpage(video_id)
1619 webpage = urllib2.urlopen(request).read()
1620 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1621 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1624 # Extract URL, uploader and title from webpage
1625 self.report_extraction(video_id)
1626 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1628 self._downloader.trouble(u'ERROR: unable to extract media URL')
1630 sequence = urllib.unquote(mobj.group(1))
1631 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1633 self._downloader.trouble(u'ERROR: unable to extract media URL')
1635 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1637 # if needed add http://www.dailymotion.com/ if relative URL
1639 video_url = mediaURL
1641 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
1643 self._downloader.trouble(u'ERROR: unable to extract title')
1645 video_title = _unescapeHTML(mobj.group('title').decode('utf-8'))
1646 video_title = sanitize_title(video_title)
1647 simple_title = _simplify_title(video_title)
1649 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1651 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1653 video_uploader = mobj.group(1)
1656 # Process video information
1657 self._downloader.process_info({
1658 'id': video_id.decode('utf-8'),
1659 'url': video_url.decode('utf-8'),
1660 'uploader': video_uploader.decode('utf-8'),
1661 'upload_date': u'NA',
1662 'title': video_title,
1663 'stitle': simple_title,
1664 'ext': video_extension.decode('utf-8'),
1668 except UnavailableVideoError:
1669 self._downloader.trouble(u'\nERROR: unable to download video')
1672 class GoogleIE(InfoExtractor):
1673 """Information extractor for video.google.com."""
1675 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1676 IE_NAME = u'video.google'
1678 def __init__(self, downloader=None):
1679 InfoExtractor.__init__(self, downloader)
1681 def report_download_webpage(self, video_id):
1682 """Report webpage download."""
1683 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1685 def report_extraction(self, video_id):
1686 """Report information extraction."""
1687 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1689 def _real_extract(self, url):
1690 # Extract id from URL
1691 mobj = re.match(self._VALID_URL, url)
1693 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1696 # At this point we have a new video
1697 self._downloader.increment_downloads()
1698 video_id = mobj.group(1)
1700 video_extension = 'mp4'
1702 # Retrieve video webpage to extract further information
1703 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1705 self.report_download_webpage(video_id)
1706 webpage = urllib2.urlopen(request).read()
1707 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1708 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1711 # Extract URL, uploader, and title from webpage
1712 self.report_extraction(video_id)
1713 mobj = re.search(r"download_url:'([^']+)'", webpage)
1715 video_extension = 'flv'
1716 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1718 self._downloader.trouble(u'ERROR: unable to extract media URL')
1720 mediaURL = urllib.unquote(mobj.group(1))
1721 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1722 mediaURL = mediaURL.replace('\\x26', '\x26')
1724 video_url = mediaURL
1726 mobj = re.search(r'<title>(.*)</title>', webpage)
1728 self._downloader.trouble(u'ERROR: unable to extract title')
1730 video_title = mobj.group(1).decode('utf-8')
1731 video_title = sanitize_title(video_title)
1732 simple_title = _simplify_title(video_title)
1734 # Extract video description
1735 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1737 self._downloader.trouble(u'ERROR: unable to extract video description')
1739 video_description = mobj.group(1).decode('utf-8')
1740 if not video_description:
1741 video_description = 'No description available.'
1743 # Extract video thumbnail
1744 if self._downloader.params.get('forcethumbnail', False):
1745 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1747 webpage = urllib2.urlopen(request).read()
1748 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1749 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1751 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1753 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1755 video_thumbnail = mobj.group(1)
1756 else: # we need something to pass to process_info
1757 video_thumbnail = ''
1760 # Process video information
1761 self._downloader.process_info({
1762 'id': video_id.decode('utf-8'),
1763 'url': video_url.decode('utf-8'),
1765 'upload_date': u'NA',
1766 'title': video_title,
1767 'stitle': simple_title,
1768 'ext': video_extension.decode('utf-8'),
1772 except UnavailableVideoError:
1773 self._downloader.trouble(u'\nERROR: unable to download video')
1776 class PhotobucketIE(InfoExtractor):
1777 """Information extractor for photobucket.com."""
1779 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1780 IE_NAME = u'photobucket'
1782 def __init__(self, downloader=None):
1783 InfoExtractor.__init__(self, downloader)
1785 def report_download_webpage(self, video_id):
1786 """Report webpage download."""
1787 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1789 def report_extraction(self, video_id):
1790 """Report information extraction."""
1791 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1793 def _real_extract(self, url):
1794 # Extract id from URL
1795 mobj = re.match(self._VALID_URL, url)
1797 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1800 # At this point we have a new video
1801 self._downloader.increment_downloads()
1802 video_id = mobj.group(1)
1804 video_extension = 'flv'
1806 # Retrieve video webpage to extract further information
1807 request = urllib2.Request(url)
1809 self.report_download_webpage(video_id)
1810 webpage = urllib2.urlopen(request).read()
1811 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1812 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1815 # Extract URL, uploader, and title from webpage
1816 self.report_extraction(video_id)
1817 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1819 self._downloader.trouble(u'ERROR: unable to extract media URL')
1821 mediaURL = urllib.unquote(mobj.group(1))
1823 video_url = mediaURL
1825 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1827 self._downloader.trouble(u'ERROR: unable to extract title')
1829 video_title = mobj.group(1).decode('utf-8')
1830 video_title = sanitize_title(video_title)
1831 simple_title = _simplify_title(vide_title)
1833 video_uploader = mobj.group(2).decode('utf-8')
1836 # Process video information
1837 self._downloader.process_info({
1838 'id': video_id.decode('utf-8'),
1839 'url': video_url.decode('utf-8'),
1840 'uploader': video_uploader,
1841 'upload_date': u'NA',
1842 'title': video_title,
1843 'stitle': simple_title,
1844 'ext': video_extension.decode('utf-8'),
1848 except UnavailableVideoError:
1849 self._downloader.trouble(u'\nERROR: unable to download video')
1852 class YahooIE(InfoExtractor):
1853 """Information extractor for video.yahoo.com."""
1855 # _VALID_URL matches all Yahoo! Video URLs
1856 # _VPAGE_URL matches only the extractable '/watch/' URLs
1857 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1858 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1859 IE_NAME = u'video.yahoo'
1861 def __init__(self, downloader=None):
1862 InfoExtractor.__init__(self, downloader)
1864 def report_download_webpage(self, video_id):
1865 """Report webpage download."""
1866 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1868 def report_extraction(self, video_id):
1869 """Report information extraction."""
1870 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1872 def _real_extract(self, url, new_video=True):
1873 # Extract ID from URL
1874 mobj = re.match(self._VALID_URL, url)
1876 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1879 # At this point we have a new video
1880 self._downloader.increment_downloads()
1881 video_id = mobj.group(2)
1882 video_extension = 'flv'
1884 # Rewrite valid but non-extractable URLs as
1885 # extractable English language /watch/ URLs
1886 if re.match(self._VPAGE_URL, url) is None:
1887 request = urllib2.Request(url)
1889 webpage = urllib2.urlopen(request).read()
1890 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1891 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1894 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1896 self._downloader.trouble(u'ERROR: Unable to extract id field')
1898 yahoo_id = mobj.group(1)
1900 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1902 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1904 yahoo_vid = mobj.group(1)
1906 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1907 return self._real_extract(url, new_video=False)
1909 # Retrieve video webpage to extract further information
1910 request = urllib2.Request(url)
1912 self.report_download_webpage(video_id)
1913 webpage = urllib2.urlopen(request).read()
1914 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1915 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1918 # Extract uploader and title from webpage
1919 self.report_extraction(video_id)
1920 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1922 self._downloader.trouble(u'ERROR: unable to extract video title')
1924 video_title = mobj.group(1).decode('utf-8')
1925 simple_title = _simplify_title(video_title)
1927 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1929 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1931 video_uploader = mobj.group(1).decode('utf-8')
1933 # Extract video thumbnail
1934 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1936 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1938 video_thumbnail = mobj.group(1).decode('utf-8')
1940 # Extract video description
1941 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1943 self._downloader.trouble(u'ERROR: unable to extract video description')
1945 video_description = mobj.group(1).decode('utf-8')
1946 if not video_description:
1947 video_description = 'No description available.'
1949 # Extract video height and width
1950 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1952 self._downloader.trouble(u'ERROR: unable to extract video height')
1954 yv_video_height = mobj.group(1)
1956 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1958 self._downloader.trouble(u'ERROR: unable to extract video width')
1960 yv_video_width = mobj.group(1)
1962 # Retrieve video playlist to extract media URL
1963 # I'm not completely sure what all these options are, but we
1964 # seem to need most of them, otherwise the server sends a 401.
1965 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1966 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1967 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1968 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1969 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1971 self.report_download_webpage(video_id)
1972 webpage = urllib2.urlopen(request).read()
1973 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1974 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1977 # Extract media URL from playlist XML
1978 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1980 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1982 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1983 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1986 # Process video information
1987 self._downloader.process_info({
1988 'id': video_id.decode('utf-8'),
1990 'uploader': video_uploader,
1991 'upload_date': u'NA',
1992 'title': video_title,
1993 'stitle': simple_title,
1994 'ext': video_extension.decode('utf-8'),
1995 'thumbnail': video_thumbnail.decode('utf-8'),
1996 'description': video_description,
1997 'thumbnail': video_thumbnail,
2000 except UnavailableVideoError:
2001 self._downloader.trouble(u'\nERROR: unable to download video')
2004 class VimeoIE(InfoExtractor):
2005 """Information extractor for vimeo.com."""
2007 # _VALID_URL matches Vimeo URLs
2008 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
2011 def __init__(self, downloader=None):
2012 InfoExtractor.__init__(self, downloader)
2014 def report_download_webpage(self, video_id):
2015 """Report webpage download."""
2016 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
2018 def report_extraction(self, video_id):
2019 """Report information extraction."""
2020 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
2022 def _real_extract(self, url, new_video=True):
2023 # Extract ID from URL
2024 mobj = re.match(self._VALID_URL, url)
2026 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2029 # At this point we have a new video
2030 self._downloader.increment_downloads()
2031 video_id = mobj.group(1)
2033 # Retrieve video webpage to extract further information
2034 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
2036 self.report_download_webpage(video_id)
2037 webpage = urllib2.urlopen(request).read()
2038 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2039 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2042 # Now we begin extracting as much information as we can from what we
2043 # retrieved. First we extract the information common to all extractors,
2044 # and latter we extract those that are Vimeo specific.
2045 self.report_extraction(video_id)
2048 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
2050 self._downloader.trouble(u'ERROR: unable to extract video title')
2052 video_title = mobj.group(1).decode('utf-8')
2053 simple_title = _simplify_title(video_title)
2056 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
2058 self._downloader.trouble(u'ERROR: unable to extract video uploader')
2060 video_uploader = mobj.group(1).decode('utf-8')
2062 # Extract video thumbnail
2063 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
2065 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2067 video_thumbnail = mobj.group(1).decode('utf-8')
2069 # # Extract video description
2070 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2072 # self._downloader.trouble(u'ERROR: unable to extract video description')
2074 # video_description = mobj.group(1).decode('utf-8')
2075 # if not video_description: video_description = 'No description available.'
2076 video_description = 'Foo.'
2078 # Vimeo specific: extract request signature
2079 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2081 self._downloader.trouble(u'ERROR: unable to extract request signature')
2083 sig = mobj.group(1).decode('utf-8')
2085 # Vimeo specific: extract video quality information
2086 mobj = re.search(r'<isHD>(\d+)</isHD>', webpage)
2088 self._downloader.trouble(u'ERROR: unable to extract video quality information')
2090 quality = mobj.group(1).decode('utf-8')
2092 if int(quality) == 1:
2097 # Vimeo specific: Extract request signature expiration
2098 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2100 self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2102 sig_exp = mobj.group(1).decode('utf-8')
2104 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s/?q=%s" % (video_id, sig, sig_exp, quality)
2107 # Process video information
2108 self._downloader.process_info({
2109 'id': video_id.decode('utf-8'),
2111 'uploader': video_uploader,
2112 'upload_date': u'NA',
2113 'title': video_title,
2114 'stitle': simple_title,
2116 'thumbnail': video_thumbnail.decode('utf-8'),
2117 'description': video_description,
2118 'thumbnail': video_thumbnail,
2119 'description': video_description,
2122 except UnavailableVideoError:
2123 self._downloader.trouble(u'ERROR: unable to download video')
2126 class GenericIE(InfoExtractor):
2127 """Generic last-resort information extractor."""
2130 IE_NAME = u'generic'
2132 def __init__(self, downloader=None):
2133 InfoExtractor.__init__(self, downloader)
2135 def report_download_webpage(self, video_id):
2136 """Report webpage download."""
2137 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2138 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2140 def report_extraction(self, video_id):
2141 """Report information extraction."""
2142 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2144 def _real_extract(self, url):
2145 # At this point we have a new video
2146 self._downloader.increment_downloads()
2148 video_id = url.split('/')[-1]
2149 request = urllib2.Request(url)
2151 self.report_download_webpage(video_id)
2152 webpage = urllib2.urlopen(request).read()
2153 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2154 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2156 except ValueError, err:
2157 # since this is the last-resort InfoExtractor, if
2158 # this error is thrown, it'll be thrown here
2159 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2162 self.report_extraction(video_id)
2163 # Start with something easy: JW Player in SWFObject
2164 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2166 # Broaden the search a little bit
2167 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2169 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2172 # It's possible that one of the regexes
2173 # matched, but returned an empty group:
2174 if mobj.group(1) is None:
2175 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2178 video_url = urllib.unquote(mobj.group(1))
2179 video_id = os.path.basename(video_url)
2181 # here's a fun little line of code for you:
2182 video_extension = os.path.splitext(video_id)[1][1:]
2183 video_id = os.path.splitext(video_id)[0]
2185 # it's tempting to parse this further, but you would
2186 # have to take into account all the variations like
2187 # Video Title - Site Name
2188 # Site Name | Video Title
2189 # Video Title - Tagline | Site Name
2190 # and so on and so forth; it's just not practical
2191 mobj = re.search(r'<title>(.*)</title>', webpage)
2193 self._downloader.trouble(u'ERROR: unable to extract title')
2195 video_title = mobj.group(1).decode('utf-8')
2196 video_title = sanitize_title(video_title)
2197 simple_title = _simplify_title(video_title)
2199 # video uploader is domain name
2200 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2202 self._downloader.trouble(u'ERROR: unable to extract title')
2204 video_uploader = mobj.group(1).decode('utf-8')
2207 # Process video information
2208 self._downloader.process_info({
2209 'id': video_id.decode('utf-8'),
2210 'url': video_url.decode('utf-8'),
2211 'uploader': video_uploader,
2212 'upload_date': u'NA',
2213 'title': video_title,
2214 'stitle': simple_title,
2215 'ext': video_extension.decode('utf-8'),
2219 except UnavailableVideoError, err:
2220 self._downloader.trouble(u'\nERROR: unable to download video')
2223 class YoutubeSearchIE(InfoExtractor):
2224 """Information Extractor for YouTube search queries."""
2225 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2226 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2227 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2228 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2230 _max_youtube_results = 1000
2231 IE_NAME = u'youtube:search'
2233 def __init__(self, youtube_ie, downloader=None):
2234 InfoExtractor.__init__(self, downloader)
2235 self._youtube_ie = youtube_ie
2237 def report_download_page(self, query, pagenum):
2238 """Report attempt to download playlist page with given number."""
2239 query = query.decode(preferredencoding())
2240 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2242 def _real_initialize(self):
2243 self._youtube_ie.initialize()
2245 def _real_extract(self, query):
2246 mobj = re.match(self._VALID_URL, query)
2248 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2251 prefix, query = query.split(':')
2253 query = query.encode('utf-8')
2255 self._download_n_results(query, 1)
2257 elif prefix == 'all':
2258 self._download_n_results(query, self._max_youtube_results)
2264 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2266 elif n > self._max_youtube_results:
2267 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2268 n = self._max_youtube_results
2269 self._download_n_results(query, n)
2271 except ValueError: # parsing prefix as integer fails
2272 self._download_n_results(query, 1)
2275 def _download_n_results(self, query, n):
2276 """Downloads a specified number of results for a query"""
2279 already_seen = set()
2283 self.report_download_page(query, pagenum)
2284 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2285 request = urllib2.Request(result_url)
2287 page = urllib2.urlopen(request).read()
2288 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2289 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2292 # Extract video identifiers
2293 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2294 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2295 if video_id not in already_seen:
2296 video_ids.append(video_id)
2297 already_seen.add(video_id)
2298 if len(video_ids) == n:
2299 # Specified n videos reached
2300 for id in video_ids:
2301 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2304 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2305 for id in video_ids:
2306 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2309 pagenum = pagenum + 1
2312 class GoogleSearchIE(InfoExtractor):
2313 """Information Extractor for Google Video search queries."""
2314 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2315 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2316 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2317 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2319 _max_google_results = 1000
2320 IE_NAME = u'video.google:search'
2322 def __init__(self, google_ie, downloader=None):
2323 InfoExtractor.__init__(self, downloader)
2324 self._google_ie = google_ie
2326 def report_download_page(self, query, pagenum):
2327 """Report attempt to download playlist page with given number."""
2328 query = query.decode(preferredencoding())
2329 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2331 def _real_initialize(self):
2332 self._google_ie.initialize()
2334 def _real_extract(self, query):
2335 mobj = re.match(self._VALID_URL, query)
2337 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2340 prefix, query = query.split(':')
2342 query = query.encode('utf-8')
2344 self._download_n_results(query, 1)
2346 elif prefix == 'all':
2347 self._download_n_results(query, self._max_google_results)
2353 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2355 elif n > self._max_google_results:
2356 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2357 n = self._max_google_results
2358 self._download_n_results(query, n)
2360 except ValueError: # parsing prefix as integer fails
2361 self._download_n_results(query, 1)
2364 def _download_n_results(self, query, n):
2365 """Downloads a specified number of results for a query"""
2368 already_seen = set()
2372 self.report_download_page(query, pagenum)
2373 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2374 request = urllib2.Request(result_url)
2376 page = urllib2.urlopen(request).read()
2377 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2378 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2381 # Extract video identifiers
2382 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2383 video_id = mobj.group(1)
2384 if video_id not in already_seen:
2385 video_ids.append(video_id)
2386 already_seen.add(video_id)
2387 if len(video_ids) == n:
2388 # Specified n videos reached
2389 for id in video_ids:
2390 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2393 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2394 for id in video_ids:
2395 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2398 pagenum = pagenum + 1
2401 class YahooSearchIE(InfoExtractor):
2402 """Information Extractor for Yahoo! Video search queries."""
2403 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2404 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2405 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2406 _MORE_PAGES_INDICATOR = r'\s*Next'
2408 _max_yahoo_results = 1000
2409 IE_NAME = u'video.yahoo:search'
2411 def __init__(self, yahoo_ie, downloader=None):
2412 InfoExtractor.__init__(self, downloader)
2413 self._yahoo_ie = yahoo_ie
2415 def report_download_page(self, query, pagenum):
2416 """Report attempt to download playlist page with given number."""
2417 query = query.decode(preferredencoding())
2418 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2420 def _real_initialize(self):
2421 self._yahoo_ie.initialize()
2423 def _real_extract(self, query):
2424 mobj = re.match(self._VALID_URL, query)
2426 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2429 prefix, query = query.split(':')
2431 query = query.encode('utf-8')
2433 self._download_n_results(query, 1)
2435 elif prefix == 'all':
2436 self._download_n_results(query, self._max_yahoo_results)
2442 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2444 elif n > self._max_yahoo_results:
2445 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2446 n = self._max_yahoo_results
2447 self._download_n_results(query, n)
2449 except ValueError: # parsing prefix as integer fails
2450 self._download_n_results(query, 1)
2453 def _download_n_results(self, query, n):
2454 """Downloads a specified number of results for a query"""
2457 already_seen = set()
2461 self.report_download_page(query, pagenum)
2462 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2463 request = urllib2.Request(result_url)
2465 page = urllib2.urlopen(request).read()
2466 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2467 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2470 # Extract video identifiers
2471 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2472 video_id = mobj.group(1)
2473 if video_id not in already_seen:
2474 video_ids.append(video_id)
2475 already_seen.add(video_id)
2476 if len(video_ids) == n:
2477 # Specified n videos reached
2478 for id in video_ids:
2479 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2482 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2483 for id in video_ids:
2484 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2487 pagenum = pagenum + 1
2490 class YoutubePlaylistIE(InfoExtractor):
2491 """Information Extractor for YouTube playlists."""
2493 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2494 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2495 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2496 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2498 IE_NAME = u'youtube:playlist'
2500 def __init__(self, youtube_ie, downloader=None):
2501 InfoExtractor.__init__(self, downloader)
2502 self._youtube_ie = youtube_ie
2504 def report_download_page(self, playlist_id, pagenum):
2505 """Report attempt to download playlist page with given number."""
2506 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2508 def _real_initialize(self):
2509 self._youtube_ie.initialize()
2511 def _real_extract(self, url):
2512 # Extract playlist id
2513 mobj = re.match(self._VALID_URL, url)
2515 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2519 if mobj.group(3) is not None:
2520 self._youtube_ie.extract(mobj.group(3))
2523 # Download playlist pages
2524 # prefix is 'p' as default for playlists but there are other types that need extra care
2525 playlist_prefix = mobj.group(1)
2526 if playlist_prefix == 'a':
2527 playlist_access = 'artist'
2529 playlist_prefix = 'p'
2530 playlist_access = 'view_play_list'
2531 playlist_id = mobj.group(2)
2536 self.report_download_page(playlist_id, pagenum)
2537 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2538 request = urllib2.Request(url)
2540 page = urllib2.urlopen(request).read()
2541 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2542 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2545 # Extract video identifiers
2547 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2548 if mobj.group(1) not in ids_in_page:
2549 ids_in_page.append(mobj.group(1))
2550 video_ids.extend(ids_in_page)
2552 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2554 pagenum = pagenum + 1
2556 playliststart = self._downloader.params.get('playliststart', 1) - 1
2557 playlistend = self._downloader.params.get('playlistend', -1)
2558 video_ids = video_ids[playliststart:playlistend]
2560 for id in video_ids:
2561 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2565 class YoutubeUserIE(InfoExtractor):
2566 """Information Extractor for YouTube users."""
2568 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2569 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2570 _GDATA_PAGE_SIZE = 50
2571 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2572 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
2574 IE_NAME = u'youtube:user'
2576 def __init__(self, youtube_ie, downloader=None):
2577 InfoExtractor.__init__(self, downloader)
2578 self._youtube_ie = youtube_ie
2580 def report_download_page(self, username, start_index):
2581 """Report attempt to download user page."""
2582 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2583 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2585 def _real_initialize(self):
2586 self._youtube_ie.initialize()
2588 def _real_extract(self, url):
2590 mobj = re.match(self._VALID_URL, url)
2592 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2595 username = mobj.group(1)
2597 # Download video ids using YouTube Data API. Result size per
2598 # query is limited (currently to 50 videos) so we need to query
2599 # page by page until there are no video ids - it means we got
2606 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2607 self.report_download_page(username, start_index)
2609 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2612 page = urllib2.urlopen(request).read()
2613 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2614 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2617 # Extract video identifiers
2620 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2621 if mobj.group(1) not in ids_in_page:
2622 ids_in_page.append(mobj.group(1))
2624 video_ids.extend(ids_in_page)
2626 # A little optimization - if current page is not
2627 # "full", ie. does not contain PAGE_SIZE video ids then
2628 # we can assume that this page is the last one - there
2629 # are no more ids on further pages - no need to query
2632 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2637 all_ids_count = len(video_ids)
2638 playliststart = self._downloader.params.get('playliststart', 1) - 1
2639 playlistend = self._downloader.params.get('playlistend', -1)
2641 if playlistend == -1:
2642 video_ids = video_ids[playliststart:]
2644 video_ids = video_ids[playliststart:playlistend]
2646 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2647 (username, all_ids_count, len(video_ids)))
2649 for video_id in video_ids:
2650 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2653 class DepositFilesIE(InfoExtractor):
2654 """Information extractor for depositfiles.com"""
2656 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2657 IE_NAME = u'DepositFiles'
2659 def __init__(self, downloader=None):
2660 InfoExtractor.__init__(self, downloader)
2662 def report_download_webpage(self, file_id):
2663 """Report webpage download."""
2664 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2666 def report_extraction(self, file_id):
2667 """Report information extraction."""
2668 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2670 def _real_extract(self, url):
2671 # At this point we have a new file
2672 self._downloader.increment_downloads()
2674 file_id = url.split('/')[-1]
2675 # Rebuild url in english locale
2676 url = 'http://depositfiles.com/en/files/' + file_id
2678 # Retrieve file webpage with 'Free download' button pressed
2679 free_download_indication = { 'gateway_result' : '1' }
2680 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2682 self.report_download_webpage(file_id)
2683 webpage = urllib2.urlopen(request).read()
2684 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2685 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2688 # Search for the real file URL
2689 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2690 if (mobj is None) or (mobj.group(1) is None):
2691 # Try to figure out reason of the error.
2692 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2693 if (mobj is not None) and (mobj.group(1) is not None):
2694 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2695 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2697 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2700 file_url = mobj.group(1)
2701 file_extension = os.path.splitext(file_url)[1][1:]
2703 # Search for file title
2704 mobj = re.search(r'<b title="(.*?)">', webpage)
2706 self._downloader.trouble(u'ERROR: unable to extract title')
2708 file_title = mobj.group(1).decode('utf-8')
2711 # Process file information
2712 self._downloader.process_info({
2713 'id': file_id.decode('utf-8'),
2714 'url': file_url.decode('utf-8'),
2716 'upload_date': u'NA',
2717 'title': file_title,
2718 'stitle': file_title,
2719 'ext': file_extension.decode('utf-8'),
2723 except UnavailableVideoError, err:
2724 self._downloader.trouble(u'ERROR: unable to download file')
2727 class FacebookIE(InfoExtractor):
2728 """Information Extractor for Facebook"""
2730 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2731 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2732 _NETRC_MACHINE = 'facebook'
2733 _available_formats = ['video', 'highqual', 'lowqual']
2734 _video_extensions = {
2739 IE_NAME = u'facebook'
2741 def __init__(self, downloader=None):
2742 InfoExtractor.__init__(self, downloader)
2744 def _reporter(self, message):
2745 """Add header and report message."""
2746 self._downloader.to_screen(u'[facebook] %s' % message)
2748 def report_login(self):
2749 """Report attempt to log in."""
2750 self._reporter(u'Logging in')
2752 def report_video_webpage_download(self, video_id):
2753 """Report attempt to download video webpage."""
2754 self._reporter(u'%s: Downloading video webpage' % video_id)
2756 def report_information_extraction(self, video_id):
2757 """Report attempt to extract video information."""
2758 self._reporter(u'%s: Extracting video information' % video_id)
2760 def _parse_page(self, video_webpage):
2761 """Extract video information from page"""
2763 data = {'title': r'\("video_title", "(.*?)"\)',
2764 'description': r'<div class="datawrap">(.*?)</div>',
2765 'owner': r'\("video_owner_name", "(.*?)"\)',
2766 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2769 for piece in data.keys():
2770 mobj = re.search(data[piece], video_webpage)
2771 if mobj is not None:
2772 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2776 for fmt in self._available_formats:
2777 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2778 if mobj is not None:
2779 # URL is in a Javascript segment inside an escaped Unicode format within
2780 # the generally utf-8 page
2781 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2782 video_info['video_urls'] = video_urls
2786 def _real_initialize(self):
2787 if self._downloader is None:
2792 downloader_params = self._downloader.params
2794 # Attempt to use provided username and password or .netrc data
2795 if downloader_params.get('username', None) is not None:
2796 useremail = downloader_params['username']
2797 password = downloader_params['password']
2798 elif downloader_params.get('usenetrc', False):
2800 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2801 if info is not None:
2805 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2806 except (IOError, netrc.NetrcParseError), err:
2807 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2810 if useremail is None:
2819 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2822 login_results = urllib2.urlopen(request).read()
2823 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2824 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2826 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2827 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2830 def _real_extract(self, url):
2831 mobj = re.match(self._VALID_URL, url)
2833 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2835 video_id = mobj.group('ID')
2838 self.report_video_webpage_download(video_id)
2839 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2841 page = urllib2.urlopen(request)
2842 video_webpage = page.read()
2843 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2844 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2847 # Start extracting information
2848 self.report_information_extraction(video_id)
2850 # Extract information
2851 video_info = self._parse_page(video_webpage)
2854 if 'owner' not in video_info:
2855 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2857 video_uploader = video_info['owner']
2860 if 'title' not in video_info:
2861 self._downloader.trouble(u'ERROR: unable to extract video title')
2863 video_title = video_info['title']
2864 video_title = video_title.decode('utf-8')
2865 video_title = sanitize_title(video_title)
2867 simple_title = _simplify_title(video_title)
2870 if 'thumbnail' not in video_info:
2871 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2872 video_thumbnail = ''
2874 video_thumbnail = video_info['thumbnail']
2878 if 'upload_date' in video_info:
2879 upload_time = video_info['upload_date']
2880 timetuple = email.utils.parsedate_tz(upload_time)
2881 if timetuple is not None:
2883 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2888 video_description = video_info.get('description', 'No description available.')
2890 url_map = video_info['video_urls']
2891 if len(url_map.keys()) > 0:
2892 # Decide which formats to download
2893 req_format = self._downloader.params.get('format', None)
2894 format_limit = self._downloader.params.get('format_limit', None)
2896 if format_limit is not None and format_limit in self._available_formats:
2897 format_list = self._available_formats[self._available_formats.index(format_limit):]
2899 format_list = self._available_formats
2900 existing_formats = [x for x in format_list if x in url_map]
2901 if len(existing_formats) == 0:
2902 self._downloader.trouble(u'ERROR: no known formats available for video')
2904 if req_format is None:
2905 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2906 elif req_format == 'worst':
2907 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2908 elif req_format == '-1':
2909 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2912 if req_format not in url_map:
2913 self._downloader.trouble(u'ERROR: requested format not available')
2915 video_url_list = [(req_format, url_map[req_format])] # Specific format
2917 for format_param, video_real_url in video_url_list:
2919 # At this point we have a new video
2920 self._downloader.increment_downloads()
2923 video_extension = self._video_extensions.get(format_param, 'mp4')
2926 # Process video information
2927 self._downloader.process_info({
2928 'id': video_id.decode('utf-8'),
2929 'url': video_real_url.decode('utf-8'),
2930 'uploader': video_uploader.decode('utf-8'),
2931 'upload_date': upload_date,
2932 'title': video_title,
2933 'stitle': simple_title,
2934 'ext': video_extension.decode('utf-8'),
2935 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2936 'thumbnail': video_thumbnail.decode('utf-8'),
2937 'description': video_description.decode('utf-8'),
2940 except UnavailableVideoError, err:
2941 self._downloader.trouble(u'\nERROR: unable to download video')
2943 class BlipTVIE(InfoExtractor):
2944 """Information extractor for blip.tv"""
2946 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2947 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2948 IE_NAME = u'blip.tv'
2950 def report_extraction(self, file_id):
2951 """Report information extraction."""
2952 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2954 def report_direct_download(self, title):
2955 """Report information extraction."""
2956 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2958 def _real_extract(self, url):
2959 mobj = re.match(self._VALID_URL, url)
2961 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2968 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2969 request = urllib2.Request(json_url)
2970 self.report_extraction(mobj.group(1))
2973 urlh = urllib2.urlopen(request)
2974 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2975 basename = url.split('/')[-1]
2976 title,ext = os.path.splitext(basename)
2977 title = title.decode('UTF-8')
2978 ext = ext.replace('.', '')
2979 self.report_direct_download(title)
2984 'stitle': _simplify_title(title),
2988 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2989 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2991 if info is None: # Regular URL
2993 json_code = urlh.read()
2994 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2995 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
2999 json_data = json.loads(json_code)
3000 if 'Post' in json_data:
3001 data = json_data['Post']
3005 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
3006 video_url = data['media']['url']
3007 umobj = re.match(self._URL_EXT, video_url)
3009 raise ValueError('Can not determine filename extension')
3010 ext = umobj.group(1)
3013 'id': data['item_id'],
3015 'uploader': data['display_name'],
3016 'upload_date': upload_date,
3017 'title': data['title'],
3018 'stitle': _simplify_title(data['title']),
3020 'format': data['media']['mimeType'],
3021 'thumbnail': data['thumbnailUrl'],
3022 'description': data['description'],
3023 'player_url': data['embedUrl']
3025 except (ValueError,KeyError), err:
3026 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
3029 self._downloader.increment_downloads()
3032 self._downloader.process_info(info)
3033 except UnavailableVideoError, err:
3034 self._downloader.trouble(u'\nERROR: unable to download video')
3037 class MyVideoIE(InfoExtractor):
3038 """Information Extractor for myvideo.de."""
3040 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3041 IE_NAME = u'myvideo'
3043 def __init__(self, downloader=None):
3044 InfoExtractor.__init__(self, downloader)
3046 def report_download_webpage(self, video_id):
3047 """Report webpage download."""
3048 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3050 def report_extraction(self, video_id):
3051 """Report information extraction."""
3052 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3054 def _real_extract(self,url):
3055 mobj = re.match(self._VALID_URL, url)
3057 self._download.trouble(u'ERROR: invalid URL: %s' % url)
3060 video_id = mobj.group(1)
3063 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3065 self.report_download_webpage(video_id)
3066 webpage = urllib2.urlopen(request).read()
3067 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3068 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3071 self.report_extraction(video_id)
3072 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3075 self._downloader.trouble(u'ERROR: unable to extract media URL')
3077 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3079 mobj = re.search('<title>([^<]+)</title>', webpage)
3081 self._downloader.trouble(u'ERROR: unable to extract title')
3084 video_title = mobj.group(1)
3085 video_title = sanitize_title(video_title)
3087 simple_title = _simplify_title(video_title)
3090 self._downloader.process_info({
3094 'upload_date': u'NA',
3095 'title': video_title,
3096 'stitle': simple_title,
3101 except UnavailableVideoError:
3102 self._downloader.trouble(u'\nERROR: Unable to download video')
3104 class ComedyCentralIE(InfoExtractor):
3105 """Information extractor for The Daily Show and Colbert Report """
3107 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
3108 IE_NAME = u'comedycentral'
3110 def report_extraction(self, episode_id):
3111 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
3113 def report_config_download(self, episode_id):
3114 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
3116 def report_index_download(self, episode_id):
3117 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
3119 def report_player_url(self, episode_id):
3120 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
3122 def _real_extract(self, url):
3123 mobj = re.match(self._VALID_URL, url)
3125 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3128 if mobj.group('shortname'):
3129 if mobj.group('shortname') in ('tds', 'thedailyshow'):
3130 url = u'http://www.thedailyshow.com/full-episodes/'
3132 url = u'http://www.colbertnation.com/full-episodes/'
3133 mobj = re.match(self._VALID_URL, url)
3134 assert mobj is not None
3136 dlNewest = not mobj.group('episode')
3138 epTitle = mobj.group('showname')
3140 epTitle = mobj.group('episode')
3142 req = urllib2.Request(url)
3143 self.report_extraction(epTitle)
3145 htmlHandle = urllib2.urlopen(req)
3146 html = htmlHandle.read()
3147 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3148 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3151 url = htmlHandle.geturl()
3152 mobj = re.match(self._VALID_URL, url)
3154 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
3156 if mobj.group('episode') == '':
3157 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
3159 epTitle = mobj.group('episode')
3161 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
3162 if len(mMovieParams) == 0:
3163 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
3166 playerUrl_raw = mMovieParams[0][0]
3167 self.report_player_url(epTitle)
3169 urlHandle = urllib2.urlopen(playerUrl_raw)
3170 playerUrl = urlHandle.geturl()
3171 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3172 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
3175 uri = mMovieParams[0][1]
3176 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
3177 self.report_index_download(epTitle)
3179 indexXml = urllib2.urlopen(indexUrl).read()
3180 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3181 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
3184 idoc = xml.etree.ElementTree.fromstring(indexXml)
3185 itemEls = idoc.findall('.//item')
3186 for itemEl in itemEls:
3187 mediaId = itemEl.findall('./guid')[0].text
3188 shortMediaId = mediaId.split(':')[-1]
3189 showId = mediaId.split(':')[-2].replace('.com', '')
3190 officialTitle = itemEl.findall('./title')[0].text
3191 officialDate = itemEl.findall('./pubDate')[0].text
3193 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
3194 urllib.urlencode({'uri': mediaId}))
3195 configReq = urllib2.Request(configUrl)
3196 self.report_config_download(epTitle)
3198 configXml = urllib2.urlopen(configReq).read()
3199 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3200 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3203 cdoc = xml.etree.ElementTree.fromstring(configXml)
3205 for rendition in cdoc.findall('.//rendition'):
3206 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
3210 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
3213 # For now, just pick the highest bitrate
3214 format,video_url = turls[-1]
3216 self._downloader.increment_downloads()
3218 effTitle = showId + u'-' + epTitle
3223 'upload_date': officialDate,
3225 'stitle': _simplify_title(effTitle),
3229 'description': officialTitle,
3230 'player_url': playerUrl
3234 self._downloader.process_info(info)
3235 except UnavailableVideoError, err:
3236 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
3240 class EscapistIE(InfoExtractor):
3241 """Information extractor for The Escapist """
3243 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3244 IE_NAME = u'escapist'
3246 def report_extraction(self, showName):
3247 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3249 def report_config_download(self, showName):
3250 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3252 def _real_extract(self, url):
3253 htmlParser = HTMLParser.HTMLParser()
3255 mobj = re.match(self._VALID_URL, url)
3257 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3259 showName = mobj.group('showname')
3260 videoId = mobj.group('episode')
3262 self.report_extraction(showName)
3264 webPage = urllib2.urlopen(url).read()
3265 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3266 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3269 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3270 description = htmlParser.unescape(descMatch.group(1))
3271 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3272 imgUrl = htmlParser.unescape(imgMatch.group(1))
3273 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3274 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3275 configUrlMatch = re.search('config=(.*)$', playerUrl)
3276 configUrl = urllib2.unquote(configUrlMatch.group(1))
3278 self.report_config_download(showName)
3280 configJSON = urllib2.urlopen(configUrl).read()
3281 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3282 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3285 # Technically, it's JavaScript, not JSON
3286 configJSON = configJSON.replace("'", '"')
3289 config = json.loads(configJSON)
3290 except (ValueError,), err:
3291 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3294 playlist = config['playlist']
3295 videoUrl = playlist[1]['url']
3297 self._downloader.increment_downloads()
3301 'uploader': showName,
3302 'upload_date': None,
3304 'stitle': _simplify_title(showName),
3307 'thumbnail': imgUrl,
3308 'description': description,
3309 'player_url': playerUrl,
3313 self._downloader.process_info(info)
3314 except UnavailableVideoError, err:
3315 self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3318 class CollegeHumorIE(InfoExtractor):
3319 """Information extractor for collegehumor.com"""
3321 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3322 IE_NAME = u'collegehumor'
3324 def report_webpage(self, video_id):
3325 """Report information extraction."""
3326 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3328 def report_extraction(self, video_id):
3329 """Report information extraction."""
3330 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3332 def _real_extract(self, url):
3333 htmlParser = HTMLParser.HTMLParser()
3335 mobj = re.match(self._VALID_URL, url)
3337 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3339 video_id = mobj.group('videoid')
3341 self.report_webpage(video_id)
3342 request = urllib2.Request(url)
3344 webpage = urllib2.urlopen(request).read()
3345 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3346 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3349 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3351 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3353 internal_video_id = m.group('internalvideoid')
3357 'internal_id': internal_video_id,
3360 self.report_extraction(video_id)
3361 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3363 metaXml = urllib2.urlopen(xmlUrl).read()
3364 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3365 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
3368 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3370 videoNode = mdoc.findall('./video')[0]
3371 info['description'] = videoNode.findall('./description')[0].text
3372 info['title'] = videoNode.findall('./caption')[0].text
3373 info['stitle'] = _simplify_title(info['title'])
3374 info['url'] = videoNode.findall('./file')[0].text
3375 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
3376 info['ext'] = info['url'].rpartition('.')[2]
3377 info['format'] = info['ext']
3379 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3382 self._downloader.increment_downloads()
3385 self._downloader.process_info(info)
3386 except UnavailableVideoError, err:
3387 self._downloader.trouble(u'\nERROR: unable to download video')
3390 class XVideosIE(InfoExtractor):
3391 """Information extractor for xvideos.com"""
3393 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3394 IE_NAME = u'xvideos'
3396 def report_webpage(self, video_id):
3397 """Report information extraction."""
3398 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3400 def report_extraction(self, video_id):
3401 """Report information extraction."""
3402 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3404 def _real_extract(self, url):
3405 htmlParser = HTMLParser.HTMLParser()
3407 mobj = re.match(self._VALID_URL, url)
3409 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3411 video_id = mobj.group(1).decode('utf-8')
3413 self.report_webpage(video_id)
3415 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3417 webpage = urllib2.urlopen(request).read()
3418 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3419 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3422 self.report_extraction(video_id)
3426 mobj = re.search(r'flv_url=(.+?)&', webpage)
3428 self._downloader.trouble(u'ERROR: unable to extract video url')
3430 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
3434 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3436 self._downloader.trouble(u'ERROR: unable to extract video title')
3438 video_title = mobj.group(1).decode('utf-8')
3441 # Extract video thumbnail
3442 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3444 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3446 video_thumbnail = mobj.group(1).decode('utf-8')
3450 self._downloader.increment_downloads()
3455 'upload_date': None,
3456 'title': video_title,
3457 'stitle': _simplify_title(video_title),
3460 'thumbnail': video_thumbnail,
3461 'description': None,
3466 self._downloader.process_info(info)
3467 except UnavailableVideoError, err:
3468 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3471 class SoundcloudIE(InfoExtractor):
3472 """Information extractor for soundcloud.com
3473 To access the media, the uid of the song and a stream token
3474 must be extracted from the page source and the script must make
3475 a request to media.soundcloud.com/crossdomain.xml. Then
3476 the media can be grabbed by requesting from an url composed
3477 of the stream token and uid
3480 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3481 IE_NAME = u'soundcloud'
3483 def __init__(self, downloader=None):
3484 InfoExtractor.__init__(self, downloader)
3486 def report_webpage(self, video_id):
3487 """Report information extraction."""
3488 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3490 def report_extraction(self, video_id):
3491 """Report information extraction."""
3492 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3494 def _real_extract(self, url):
3495 htmlParser = HTMLParser.HTMLParser()
3497 mobj = re.match(self._VALID_URL, url)
3499 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3502 # extract uploader (which is in the url)
3503 uploader = mobj.group(1).decode('utf-8')
3504 # extract simple title (uploader + slug of song title)
3505 slug_title = mobj.group(2).decode('utf-8')
3506 simple_title = uploader + '-' + slug_title
3508 self.report_webpage('%s/%s' % (uploader, slug_title))
3510 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3512 webpage = urllib2.urlopen(request).read()
3513 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3514 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3517 self.report_extraction('%s/%s' % (uploader, slug_title))
3519 # extract uid and stream token that soundcloud hands out for access
3520 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3522 video_id = mobj.group(1)
3523 stream_token = mobj.group(2)
3525 # extract unsimplified title
3526 mobj = re.search('"title":"(.*?)",', webpage)
3528 title = mobj.group(1)
3530 # construct media url (with uid/token)
3531 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3532 mediaURL = mediaURL % (video_id, stream_token)
3535 description = u'No description available'
3536 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3538 description = mobj.group(1)
3542 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3545 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3546 except Exception, e:
3549 # for soundcloud, a request to a cross domain is required for cookies
3550 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3553 self._downloader.process_info({
3554 'id': video_id.decode('utf-8'),
3556 'uploader': uploader.decode('utf-8'),
3557 'upload_date': upload_date,
3558 'title': simple_title.decode('utf-8'),
3559 'stitle': simple_title.decode('utf-8'),
3563 'description': description.decode('utf-8')
3565 except UnavailableVideoError:
3566 self._downloader.trouble(u'\nERROR: unable to download video')
3569 class InfoQIE(InfoExtractor):
3570 """Information extractor for infoq.com"""
3572 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3575 def report_webpage(self, video_id):
3576 """Report information extraction."""
3577 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3579 def report_extraction(self, video_id):
3580 """Report information extraction."""
3581 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3583 def _real_extract(self, url):
3584 htmlParser = HTMLParser.HTMLParser()
3586 mobj = re.match(self._VALID_URL, url)
3588 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3591 self.report_webpage(url)
3593 request = urllib2.Request(url)
3595 webpage = urllib2.urlopen(request).read()
3596 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3597 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3600 self.report_extraction(url)
3604 mobj = re.search(r"jsclassref='([^']*)'", webpage)
3606 self._downloader.trouble(u'ERROR: unable to extract video url')
3608 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3612 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3614 self._downloader.trouble(u'ERROR: unable to extract video title')
3616 video_title = mobj.group(1).decode('utf-8')
3618 # Extract description
3619 video_description = u'No description available.'
3620 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3621 if mobj is not None:
3622 video_description = mobj.group(1).decode('utf-8')
3624 video_filename = video_url.split('/')[-1]
3625 video_id, extension = video_filename.split('.')
3627 self._downloader.increment_downloads()
3632 'upload_date': None,
3633 'title': video_title,
3634 'stitle': _simplify_title(video_title),
3636 'format': extension, # Extension is always(?) mp4, but seems to be flv
3638 'description': video_description,
3643 self._downloader.process_info(info)
3644 except UnavailableVideoError, err:
3645 self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
3647 class MixcloudIE(InfoExtractor):
3648 """Information extractor for www.mixcloud.com"""
3649 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
3650 IE_NAME = u'mixcloud'
3652 def __init__(self, downloader=None):
3653 InfoExtractor.__init__(self, downloader)
3655 def report_download_json(self, file_id):
3656 """Report JSON download."""
3657 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
3659 def report_extraction(self, file_id):
3660 """Report information extraction."""
3661 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3663 def get_urls(self, jsonData, fmt, bitrate='best'):
3664 """Get urls from 'audio_formats' section in json"""
3667 bitrate_list = jsonData[fmt]
3668 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
3669 bitrate = max(bitrate_list) # select highest
3671 url_list = jsonData[fmt][bitrate]
3672 except TypeError: # we have no bitrate info.
3673 url_list = jsonData[fmt]
3677 def check_urls(self, url_list):
3678 """Returns 1st active url from list"""
3679 for url in url_list:
3681 urllib2.urlopen(url)
3683 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3688 def _print_formats(self, formats):
3689 print 'Available formats:'
3690 for fmt in formats.keys():
3691 for b in formats[fmt]:
3693 ext = formats[fmt][b][0]
3694 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
3695 except TypeError: # we have no bitrate info
3696 ext = formats[fmt][0]
3697 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
3700 def _real_extract(self, url):
3701 mobj = re.match(self._VALID_URL, url)
3703 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3705 # extract uploader & filename from url
3706 uploader = mobj.group(1).decode('utf-8')
3707 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3709 # construct API request
3710 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3711 # retrieve .json file with links to files
3712 request = urllib2.Request(file_url)
3714 self.report_download_json(file_url)
3715 jsonData = urllib2.urlopen(request).read()
3716 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3717 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
3721 json_data = json.loads(jsonData)
3722 player_url = json_data['player_swf_url']
3723 formats = dict(json_data['audio_formats'])
3725 req_format = self._downloader.params.get('format', None)
3728 if self._downloader.params.get('listformats', None):
3729 self._print_formats(formats)
3732 if req_format is None or req_format == 'best':
3733 for format_param in formats.keys():
3734 url_list = self.get_urls(formats, format_param)
3736 file_url = self.check_urls(url_list)
3737 if file_url is not None:
3740 if req_format not in formats.keys():
3741 self._downloader.trouble(u'ERROR: format is not available')
3744 url_list = self.get_urls(formats, req_format)
3745 file_url = self.check_urls(url_list)
3746 format_param = req_format
3749 self._downloader.increment_downloads()
3751 # Process file information
3752 self._downloader.process_info({
3753 'id': file_id.decode('utf-8'),
3754 'url': file_url.decode('utf-8'),
3755 'uploader': uploader.decode('utf-8'),
3756 'upload_date': u'NA',
3757 'title': json_data['name'],
3758 'stitle': _simplify_title(json_data['name']),
3759 'ext': file_url.split('.')[-1].decode('utf-8'),
3760 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3761 'thumbnail': json_data['thumbnail_url'],
3762 'description': json_data['description'],
3763 'player_url': player_url.decode('utf-8'),
3765 except UnavailableVideoError, err:
3766 self._downloader.trouble(u'ERROR: unable to download file')
3768 class StanfordOpenClassroomIE(InfoExtractor):
3769 """Information extractor for Stanford's Open ClassRoom"""
3771 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3772 IE_NAME = u'stanfordoc'
3774 def report_download_webpage(self, objid):
3775 """Report information extraction."""
3776 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3778 def report_extraction(self, video_id):
3779 """Report information extraction."""
3780 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3782 def _real_extract(self, url):
3783 mobj = re.match(self._VALID_URL, url)
3785 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3788 if mobj.group('course') and mobj.group('video'): # A specific video
3789 course = mobj.group('course')
3790 video = mobj.group('video')
3792 'id': _simplify_title(course + '_' + video),
3795 self.report_extraction(info['id'])
3796 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3797 xmlUrl = baseUrl + video + '.xml'
3799 metaXml = urllib2.urlopen(xmlUrl).read()
3800 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3801 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
3803 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3805 info['title'] = mdoc.findall('./title')[0].text
3806 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3808 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3810 info['stitle'] = _simplify_title(info['title'])
3811 info['ext'] = info['url'].rpartition('.')[2]
3812 info['format'] = info['ext']
3813 self._downloader.increment_downloads()
3815 self._downloader.process_info(info)
3816 except UnavailableVideoError, err:
3817 self._downloader.trouble(u'\nERROR: unable to download video')
3818 elif mobj.group('course'): # A course page
3819 unescapeHTML = HTMLParser.HTMLParser().unescape
3821 course = mobj.group('course')
3823 'id': _simplify_title(course),
3827 self.report_download_webpage(info['id'])
3829 coursepage = urllib2.urlopen(url).read()
3830 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3831 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
3834 m = re.search('<h1>([^<]+)</h1>', coursepage)
3836 info['title'] = unescapeHTML(m.group(1))
3838 info['title'] = info['id']
3839 info['stitle'] = _simplify_title(info['title'])
3841 m = re.search('<description>([^<]+)</description>', coursepage)
3843 info['description'] = unescapeHTML(m.group(1))
3845 links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3848 'type': 'reference',
3849 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3853 for entry in info['list']:
3854 assert entry['type'] == 'reference'
3855 self.extract(entry['url'])
3857 unescapeHTML = HTMLParser.HTMLParser().unescape
3860 'id': 'Stanford OpenClassroom',
3864 self.report_download_webpage(info['id'])
3865 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3867 rootpage = urllib2.urlopen(rootURL).read()
3868 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3869 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
3872 info['title'] = info['id']
3873 info['stitle'] = _simplify_title(info['title'])
3875 links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3878 'type': 'reference',
3879 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3883 for entry in info['list']:
3884 assert entry['type'] == 'reference'
3885 self.extract(entry['url'])
class PostProcessor(object):
    """Base class for post-processing steps.

    A downloader keeps a chain of PostProcessor objects, registered
    through its add_post_processor() method. After a successful
    download it calls run() on each of them in order: the first one
    receives the initial information, every later one receives whatever
    the previous run() returned.

    The chain ends when a run() returns None or when the last
    PostProcessor has been called.

    Like InfoExtractor objects, PostProcessors and downloaders register
    with each other ("mutual registration").
    """

    _downloader = None

    def __init__(self, downloader=None):
        self._downloader = downloader

    def set_downloader(self, downloader):
        """Attach this PostProcessor to `downloader`."""
        self._downloader = downloader

    def run(self, information):
        """Process one downloaded file.

        `information` is a dictionary like those built by the
        InfoExtractors, plus a "filepath" entry pointing to the
        downloaded file.

        Return None to stop the post-processing chain, or an information
        dictionary (possibly the received one with some fields changed)
        to pass on to the next PostProcessor.

        A PostProcessingError may be raised; the calling downloader
        takes it into account.
        """
        # The base implementation passes the data through untouched.
        return information
class AudioConversionError(Exception):
    """Raised when ffmpeg cannot be run or exits with an error.

    Derives from Exception rather than BaseException: BaseException is
    meant for interpreter-exit style exceptions and does not exist on
    Python 2.4. The human-readable text is available both as `.message`
    and through str().
    """

    def __init__(self, message):
        Exception.__init__(self, message)
        self.message = message
class FFmpegExtractAudioPP(PostProcessor):
    """Post processor that extracts the audio track with ffmpeg.

    preferredcodec   -- target codec name, or 'best'/None to keep the
                        source codec where that is possible
    preferredquality -- value passed to ffmpeg's -ab option, or None
    keepvideo        -- when false, the source file is removed afterwards
    """

    def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
        PostProcessor.__init__(self, downloader)
        if preferredcodec is None:
            preferredcodec = 'best'
        self._preferredcodec = preferredcodec
        self._preferredquality = preferredquality
        self._keepvideo = keepvideo

    @staticmethod
    def get_audio_codec(path):
        """Return the codec name ffprobe reports for the audio stream of
        `path`, or None if ffprobe cannot be run, fails, or lists no
        audio stream."""
        try:
            cmd = ['ffprobe', '-show_streams', '--', path]
            # Fixed: the devnull handle used to be opened inline and was
            # never closed.
            devnull = open(os.path.devnull, 'w')
            try:
                handle = subprocess.Popen(cmd, stderr=devnull, stdout=subprocess.PIPE)
                output = handle.communicate()[0]
            finally:
                devnull.close()
            if handle.wait() != 0:
                return None
        except (IOError, OSError):
            return None
        audio_codec = None
        for line in output.split('\n'):
            if line.startswith('codec_name='):
                audio_codec = line.split('=')[1].strip()
            elif line.strip() == 'codec_type=audio' and audio_codec is not None:
                # codec_name precedes codec_type within a stream section
                return audio_codec
        return None

    @staticmethod
    def run_ffmpeg(path, out_path, codec, more_opts):
        """Run ffmpeg to write the audio of `path` to `out_path`.

        `codec` is the -acodec value (None to let ffmpeg choose) and
        `more_opts` a list of extra options. Raises AudioConversionError
        when ffmpeg is missing or exits with a non-zero status.
        """
        if codec is None:
            acodec_opts = []
        else:
            acodec_opts = ['-acodec', codec]
        cmd = ['ffmpeg', '-y', '-i', path, '-vn'] + acodec_opts + more_opts + ['--', out_path]
        try:
            p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            stdout, stderr = p.communicate()
        except (IOError, OSError):
            e = sys.exc_info()[1]
            if isinstance(e, OSError) and e.errno == 2:
                raise AudioConversionError('ffmpeg not found. Please install ffmpeg.')
            else:
                # Fixed: bare raise keeps the original traceback.
                raise
        if p.returncode != 0:
            # The last stderr line is used as the error message.
            msg = stderr.strip().split('\n')[-1]
            raise AudioConversionError(msg)

    def run(self, information):
        """Convert information['filepath'] to audio.

        Returns the information dictionary with 'filepath' pointing at
        the new audio file, or None (stopping the chain) when the codec
        cannot be determined, conversion fails, or the source file
        cannot be removed.
        """
        path = information['filepath']

        filecodec = self.get_audio_codec(path)
        if filecodec is None:
            self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
            return None

        more_opts = []
        if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'):
            if self._preferredcodec == 'm4a' and filecodec == 'aac':
                # Lossless, but in another container
                acodec = 'copy'
                extension = self._preferredcodec
                more_opts = ['-absf', 'aac_adtstoasc']
            elif filecodec in ['aac', 'mp3', 'vorbis']:
                # Lossless if possible
                acodec = 'copy'
                extension = filecodec
                if filecodec == 'aac':
                    more_opts = ['-f', 'adts']
                if filecodec == 'vorbis':
                    extension = 'ogg'
            else:
                # MP3 otherwise.
                acodec = 'libmp3lame'
                extension = 'mp3'
                more_opts = []
                if self._preferredquality is not None:
                    more_opts += ['-ab', self._preferredquality]
        else:
            # We convert the audio (lossy)
            acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis', 'wav': None}[self._preferredcodec]
            extension = self._preferredcodec
            more_opts = []
            if self._preferredquality is not None:
                more_opts += ['-ab', self._preferredquality]
            if self._preferredcodec == 'aac':
                more_opts += ['-f', 'adts']
            if self._preferredcodec == 'm4a':
                more_opts += ['-absf', 'aac_adtstoasc']
            if self._preferredcodec == 'vorbis':
                extension = 'ogg'
            if self._preferredcodec == 'wav':
                extension = 'wav'
                more_opts += ['-f', 'wav']

        (prefix, ext) = os.path.splitext(path)
        new_path = prefix + '.' + extension
        self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
        try:
            self.run_ffmpeg(path, new_path, acodec, more_opts)
        except AudioConversionError:
            e = sys.exc_info()[1]
            self._downloader.to_stderr(u'ERROR: audio conversion failed: ' + e.message)
            return None
        except Exception:
            # Fixed: a bare except here also swallowed KeyboardInterrupt
            # and SystemExit.
            self._downloader.to_stderr(u'ERROR: error running ffmpeg')
            return None

        # Try to update the date time for extracted audio file.
        if information.get('filetime') is not None:
            try:
                os.utime(new_path, (time.time(), information['filetime']))
            except Exception:
                self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

        if not self._keepvideo:
            try:
                os.remove(path)
            except (IOError, OSError):
                self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
                return None

        information['filepath'] = new_path
        return information
4064 def updateSelf(downloader, filename):
4065 ''' Update the program file with the latest version from the repository '''
4066 # Note: downloader only used for options
4067 if not os.access(filename, os.W_OK):
4068 sys.exit('ERROR: no write permissions on %s' % filename)
4070 downloader.to_screen('Updating to latest version...')
4074 urlh = urllib.urlopen(UPDATE_URL)
4075 newcontent = urlh.read()
4077 vmatch = re.search("__version__ = '([^']+)'", newcontent)
4078 if vmatch is not None and vmatch.group(1) == __version__:
4079 downloader.to_screen('youtube-dl is up-to-date (' + __version__ + ')')
4083 except (IOError, OSError), err:
4084 sys.exit('ERROR: unable to download latest version')
4087 outf = open(filename, 'wb')
4089 outf.write(newcontent)
4092 except (IOError, OSError), err:
4093 sys.exit('ERROR: unable to overwrite current version')
4095 downloader.to_screen('Updated youtube-dl. Restart youtube-dl to use the new version.')
def _readOptions(filename):
    """Return the option tokens stored in `filename`.

    Every line is split with shell-like rules, '#' comments are
    dropped, and the tokens of all lines are concatenated. A file that
    cannot be opened yields an empty list.
    """
    try:
        handle = open(filename)
    except IOError:
        return [] # silently skip if file is not present
    tokens = []
    try:
        for line in handle:
            tokens.extend(shlex.split(line, comments=True))
    finally:
        handle.close()
    return tokens
def _format_option_string(option):
    ''' ('-o', '--option') -> -o, --option METAVAR'''
    # Only the first short and the first long spelling are shown.
    names = []
    for spellings in (option._short_opts, option._long_opts):
        if spellings:
            names.append(spellings[0])

    rendered = ', '.join(names)
    if option.takes_value():
        rendered += ' %s' % option.metavar
    return rendered
4129 def _find_term_columns():
4130 columns = os.environ.get('COLUMNS', None)
4135 sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
4136 out,err = sp.communicate()
4137 return int(out.split()[1])
def parseOpts():
	''' Build the option parser and parse the effective argument list

	Arguments are gathered, in this order, from /etc/youtube-dl.conf, from
	the per-user youtube-dl.conf (inside $XDG_CONFIG_HOME when that variable
	is set, otherwise inside ~/.config) and from sys.argv[1:].

	Returns the tuple (parser, opts, args): the optparse parser, the parsed
	option values and the remaining positional arguments.
	'''
	import optparse # function-scope import: only option parsing needs it

	# Fallback help width, replaced below when the console width is known
	max_width = 80
	max_help_position = 80

	# No need to wrap help messages if we're on a wide console
	columns = _find_term_columns()
	if columns:
		max_width = columns

	fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
	fmt.format_option_strings = _format_option_string

	kw = {
		'version' : __version__,
		'formatter' : fmt,
		'usage' : '%prog [options] url [url...]',
		# -h and -v are redefined below, so clashes with the options optparse
		# adds on its own must be resolved instead of raising
		'conflict_handler' : 'resolve',
	}

	parser = optparse.OptionParser(**kw)

	# option groups
	general = optparse.OptionGroup(parser, 'General Options')
	selection = optparse.OptionGroup(parser, 'Video Selection')
	authentication = optparse.OptionGroup(parser, 'Authentication Options')
	video_format = optparse.OptionGroup(parser, 'Video Format Options')
	postproc = optparse.OptionGroup(parser, 'Post-processing Options')
	filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
	verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')

	general.add_option('-h', '--help',
			action='help', help='print this help text and exit')
	general.add_option('-v', '--version',
			action='version', help='print program version and exit')
	general.add_option('-U', '--update',
			action='store_true', dest='update_self', help='update this program to latest version')
	general.add_option('-i', '--ignore-errors',
			action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
	general.add_option('-r', '--rate-limit',
			dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
	general.add_option('-R', '--retries',
			dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
	general.add_option('--dump-user-agent',
			action='store_true', dest='dump_user_agent',
			help='display the current browser identification', default=False)
	general.add_option('--list-extractors',
			action='store_true', dest='list_extractors',
			help='List all supported extractors and the URLs they would handle', default=False)

	selection.add_option('--playlist-start',
			dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
	selection.add_option('--playlist-end',
			dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
	selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',
			help='download only matching titles (regex or caseless sub-string)')
	selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',
			help='skip download for matching titles (regex or caseless sub-string)')
	selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads',
			help='Abort after downloading NUMBER files', default=None)

	authentication.add_option('-u', '--username',
			dest='username', metavar='USERNAME', help='account username')
	authentication.add_option('-p', '--password',
			dest='password', metavar='PASSWORD', help='account password')
	authentication.add_option('-n', '--netrc',
			action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)

	video_format.add_option('-f', '--format',
			action='store', dest='format', metavar='FORMAT', help='video format code')
	video_format.add_option('--all-formats',
			action='store_const', dest='format', help='download all available video formats', const='all')
	video_format.add_option('--prefer-free-formats',
			action='store_true', dest='prefer_free_formats', default=False,
			help='prefer free video formats unless a specific one is requested')
	video_format.add_option('--max-quality',
			action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
	video_format.add_option('-F', '--list-formats',
			action='store_true', dest='listformats', help='list all available formats (currently youtube only)')

	verbosity.add_option('-q', '--quiet',
			action='store_true', dest='quiet', help='activates quiet mode', default=False)
	verbosity.add_option('-s', '--simulate',
			action='store_true', dest='simulate',
			help='do not download the video and do not write anything to disk', default=False)
	verbosity.add_option('--skip-download',
			action='store_true', dest='skip_download', help='do not download the video', default=False)
	verbosity.add_option('-g', '--get-url',
			action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
	verbosity.add_option('-e', '--get-title',
			action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
	verbosity.add_option('--get-thumbnail',
			action='store_true', dest='getthumbnail',
			help='simulate, quiet but print thumbnail URL', default=False)
	verbosity.add_option('--get-description',
			action='store_true', dest='getdescription',
			help='simulate, quiet but print video description', default=False)
	verbosity.add_option('--get-filename',
			action='store_true', dest='getfilename',
			help='simulate, quiet but print output filename', default=False)
	verbosity.add_option('--get-format',
			action='store_true', dest='getformat',
			help='simulate, quiet but print output format', default=False)
	verbosity.add_option('--no-progress',
			action='store_true', dest='noprogress', help='do not print progress bar', default=False)
	verbosity.add_option('--console-title',
			action='store_true', dest='consoletitle',
			help='display progress in console titlebar', default=False)

	filesystem.add_option('-t', '--title',
			action='store_true', dest='usetitle', help='use title in file name', default=False)
	filesystem.add_option('-l', '--literal',
			action='store_true', dest='useliteral', help='use literal title in file name', default=False)
	filesystem.add_option('-A', '--auto-number',
			action='store_true', dest='autonumber',
			help='number downloaded files starting from 00000', default=False)
	filesystem.add_option('-o', '--output',
			dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
	filesystem.add_option('-a', '--batch-file',
			dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
	filesystem.add_option('-w', '--no-overwrites',
			action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
	filesystem.add_option('-c', '--continue',
			action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
	filesystem.add_option('--no-continue',
			action='store_false', dest='continue_dl',
			help='do not resume partially downloaded files (restart from beginning)')
	filesystem.add_option('--cookies',
			dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
	filesystem.add_option('--no-part',
			action='store_true', dest='nopart', help='do not use .part files', default=False)
	filesystem.add_option('--no-mtime',
			action='store_false', dest='updatetime',
			help='do not use the Last-modified header to set the file modification time', default=True)
	filesystem.add_option('--write-description',
			action='store_true', dest='writedescription',
			help='write video description to a .description file', default=False)
	filesystem.add_option('--write-info-json',
			action='store_true', dest='writeinfojson',
			help='write video metadata to a .info.json file', default=False)

	postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
			help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
	postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
			help='"best", "aac", "vorbis", "mp3", "m4a", or "wav"; best by default')
	postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
			help='ffmpeg audio bitrate specification, 128k by default')
	postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
			help='keeps the video file on disk after the post-processing; the video is erased by default')

	# The registration order below is the order of the groups in the help text
	parser.add_option_group(general)
	parser.add_option_group(selection)
	parser.add_option_group(filesystem)
	parser.add_option_group(verbosity)
	parser.add_option_group(video_format)
	parser.add_option_group(authentication)
	parser.add_option_group(postproc)

	# Per-user configuration file: honour XDG_CONFIG_HOME when it is set
	xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
	if xdg_config_home:
		userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
	else:
		userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
	# System-wide file first, then the user file, then the real command line
	argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
	opts, args = parser.parse_args(argv)

	return parser, opts, args
# NOTE(review): this listing of gen_extractors() is incomplete. The embedded
# line numbers jump (4309 -> 4311, 4313 -> 4315, 4317 -> 4319, 4319 -> 4322,
# 4322 -> 4325, 4325 -> 4338), so the docstring terminator, the opening of the
# returned list and an unknown number of list entries are not visible here.
# The code is therefore left untouched: the docstring says the order of the
# entries matters, so the list must be restored from the complete file before
# this function is edited.
#
# What the visible lines do show: one YoutubeIE, one GoogleIE and one YahooIE
# instance are created up front and handed to the constructors of the
# playlist / user / search extractors (and of MetacafeIE).
4307 def gen_extractors():
4308 """ Return a list of an instance of every supported extractor.
4309 The order does matter; the first extractor matched is the one handling the URL.
4311 youtube_ie = YoutubeIE()
4312 google_ie = GoogleIE()
4313 yahoo_ie = YahooIE()
4315 YoutubePlaylistIE(youtube_ie),
4316 YoutubeUserIE(youtube_ie),
4317 YoutubeSearchIE(youtube_ie),
4319 MetacafeIE(youtube_ie),
4322 GoogleSearchIE(google_ie),
4325 YahooSearchIE(yahoo_ie),
4338 StanfordOpenClassroomIE(),
# NOTE(review): body of the main program routine. Its "def" line is not
# visible in this listing, and the embedded line numbers show further gaps
# (for example 4348 -> 4351, 4359 -> 4362, 4366 -> 4369, 4386 -> 4388,
# 4491 -> 4496, 4498 -> 4501). The code is left untouched; the comments below
# describe only what the visible lines establish.
4344 parser, opts, args = parseOpts()
4346 # Open appropriate CookieJar
# Without --cookies a plain CookieJar is used; with it, a MozillaCookieJar
# bound to the given file.
4347 if opts.cookiefile is None:
4348 jar = cookielib.CookieJar()
4351 jar = cookielib.MozillaCookieJar(opts.cookiefile)
4352 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
4354 except (IOError, OSError), err:
4355 sys.exit(u'ERROR: unable to open cookie file')
4358 if opts.dump_user_agent:
4359 print std_headers['User-Agent']
4362 # Batch file verification
# Batch entries are stripped of surrounding whitespace; empty lines and
# lines starting with "#", "/" or ";" are dropped.
4364 if opts.batchfile is not None:
4366 if opts.batchfile == '-':
4369 batchfd = open(opts.batchfile, 'r')
4370 batchurls = batchfd.readlines()
4371 batchurls = [x.strip() for x in batchurls]
4372 batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
4374 sys.exit(u'ERROR: batch file could not be read')
# URLs from the batch file come before the ones given on the command line
4375 all_urls = batchurls + args
4377 # General configuration
# The installed urllib2 opener combines proxy handling, the cookie jar
# selected above and YoutubeDLHandler.
4378 cookie_processor = urllib2.HTTPCookieProcessor(jar)
4379 opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
4380 urllib2.install_opener(opener)
4381 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
4383 extractors = gen_extractors()
# --list-extractors: every URL is attributed to the first extractor whose
# suitable() accepts it; matched URLs are removed from all_urls before the
# next extractor is tried.
4385 if opts.list_extractors:
4386 for ie in extractors:
4388 matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
4389 all_urls = filter(lambda url: url not in matchedUrls, all_urls)
4390 for mu in matchedUrls:
4394 # Conflicting, missing and erroneous options
4395 if opts.usenetrc and (opts.username is not None or opts.password is not None):
4396 parser.error(u'using .netrc conflicts with giving username/password')
4397 if opts.password is not None and opts.username is None:
4398 parser.error(u'account username missing')
4399 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
4400 parser.error(u'using output template conflicts with using title, literal title or auto number')
4401 if opts.usetitle and opts.useliteral:
4402 parser.error(u'using title conflicts with using literal title')
# A username without a password triggers an interactive password prompt
4403 if opts.username is not None and opts.password is None:
4404 opts.password = getpass.getpass(u'Type account password and press return:')
# The textual rate limit is replaced by the value FileDownloader.parse_bytes
# returns for it; None from parse_bytes is treated as an invalid limit.
4405 if opts.ratelimit is not None:
4406 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
4407 if numeric_limit is None:
4408 parser.error(u'invalid rate limit specified')
4409 opts.ratelimit = numeric_limit
# retries, playliststart and playlistend are converted to integers here;
# any conversion or range failure is reported through parser.error.
4410 if opts.retries is not None:
4412 opts.retries = long(opts.retries)
4413 except (TypeError, ValueError), err:
4414 parser.error(u'invalid retry count specified')
4416 opts.playliststart = int(opts.playliststart)
4417 if opts.playliststart <= 0:
4418 raise ValueError(u'Playlist start must be positive')
4419 except (TypeError, ValueError), err:
4420 parser.error(u'invalid playlist start number specified')
# -1 is the "up to the last video" marker for the playlist end
4422 opts.playlistend = int(opts.playlistend)
4423 if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
4424 raise ValueError(u'Playlist end must be greater than playlist start')
4425 except (TypeError, ValueError), err:
4426 parser.error(u'invalid playlist end number specified')
4427 if opts.extractaudio:
4428 if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a', 'wav']:
4429 parser.error(u'invalid audio format specified')
# Notes on the FileDownloader parameters built below:
#  - 'quiet' is also switched on by each of the --get-* options
#  - 'skip_download' is also switched on by --simulate and by each of the
#    --get-* options
#  - 'outtmpl' is an and/or chain: an explicit template wins, otherwise the
#    first alternative whose options are set is used, ending with the plain
#    u'%(id)s.%(ext)s' fallback
#  - 'logtostderr' is set when the output template is '-'
4432 fd = FileDownloader({
4433 'usenetrc': opts.usenetrc,
4434 'username': opts.username,
4435 'password': opts.password,
4436 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4437 'forceurl': opts.geturl,
4438 'forcetitle': opts.gettitle,
4439 'forcethumbnail': opts.getthumbnail,
4440 'forcedescription': opts.getdescription,
4441 'forcefilename': opts.getfilename,
4442 'forceformat': opts.getformat,
4443 'simulate': opts.simulate,
4444 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
4445 'format': opts.format,
4446 'format_limit': opts.format_limit,
4447 'listformats': opts.listformats,
4448 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
4449 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
4450 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
4451 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
4452 or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
4453 or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
4454 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
4455 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
4456 or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
4457 or u'%(id)s.%(ext)s'),
4458 'ignoreerrors': opts.ignoreerrors,
4459 'ratelimit': opts.ratelimit,
4460 'nooverwrites': opts.nooverwrites,
4461 'retries': opts.retries,
4462 'continuedl': opts.continue_dl,
4463 'noprogress': opts.noprogress,
4464 'playliststart': opts.playliststart,
4465 'playlistend': opts.playlistend,
4466 'logtostderr': opts.outtmpl == '-',
4467 'consoletitle': opts.consoletitle,
4468 'nopart': opts.nopart,
4469 'updatetime': opts.updatetime,
4470 'writedescription': opts.writedescription,
4471 'writeinfojson': opts.writeinfojson,
4472 'matchtitle': opts.matchtitle,
4473 'rejecttitle': opts.rejecttitle,
4474 'max_downloads': opts.max_downloads,
4475 'prefer_free_formats': opts.prefer_free_formats,
# Extractors are registered in the order gen_extractors() returned them
4477 for extractor in extractors:
4478 fd.add_info_extractor(extractor)
4481 if opts.extractaudio:
4482 fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))
# Self-update overwrites the script that was started (sys.argv[0])
4485 if opts.update_self:
4486 updateSelf(fd, sys.argv[0])
# No URLs is only an error when --update was not requested.
# NOTE(review): the branch taken when --update was given is not visible here.
4489 if len(all_urls) < 1:
4490 if not opts.update_self:
4491 parser.error(u'you must provide at least one URL')
4496 retcode = fd.download(all_urls)
4497 except MaxDownloadsReached:
4498 fd.to_screen(u'--max-download limit reached, aborting.')
4501 # Dump cookie jar if requested
4502 if opts.cookiefile is not None:
4505 except (IOError, OSError), err:
4506 sys.exit(u'ERROR: unable to save cookie jar')
# NOTE(review): top-level error handling and the script entry guard. The
# enclosing "def"/"try" lines, the body of the DownloadError handler and the
# statement under the __main__ guard are not visible in this listing (the
# embedded line numbers jump 4506 -> 4513, 4513 -> 4515 and 4520 -> 4523), so
# the code is left untouched.
#
# Visible behaviour: SameFileError and KeyboardInterrupt end the process
# through sys.exit() with an error message.
4513 except DownloadError:
4515 except SameFileError:
4516 sys.exit(u'ERROR: fixed output name but more than one file to download')
4517 except KeyboardInterrupt:
4518 sys.exit(u'\nERROR: Interrupted by user')
4520 if __name__ == '__main__':
4523 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: