New option --restrict-filenames

author Philipp Hagemeister <phihag@phihag.de>

Mon, 26 Nov 2012 22:58:46 +0000 (23:58 +0100)

committer Philipp Hagemeister <phihag@phihag.de>

Mon, 26 Nov 2012 22:58:46 +0000 (23:58 +0100)
author Philipp Hagemeister <phihag@phihag.de>
Mon, 26 Nov 2012 22:58:46 +0000 (23:58 +0100)
committer Philipp Hagemeister <phihag@phihag.de>
Mon, 26 Nov 2012 22:58:46 +0000 (23:58 +0100)
diff --git a/README.md b/README.md

index e267760d624601a29fcdf5d814ab8ff32511af8f..14acddbd00cc9df81c5b444fc982f4f3f6ed64c7 100644 (file)
--- a/README.md
+++ b/README.md
@@ -47,6 +47,8 @@ which means you can modify it, redistribute it or use it however you like.
                               %(extractor)s for the provider (youtube, metacafe,
                               etc), %(id)s for the video id and %% for a literal
                               percent. Use - to output to stdout.
+    --restrict-filenames     Avoid some characters such as "&" and spaces in
+                             filenames
      -a, --batch-file FILE    file containing URLs to download ('-' for stdin)
      -w, --no-overwrites      do not overwrite files
      -c, --continue           resume partially downloaded files
diff --git a/test/test_utils.py b/test/test_utils.py

index e7c6d5b3d73342cf3b5544f287d358e32b9dde9d..0a435ddc547ef28a5aab9444ce466c4ccd75beaa 100644 (file)
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -30,11 +30,34 @@ class TestUtil(unittest.TestCase):
                 self.assertEqual(u'yes no', sanitize_filename(u'yes? no'))
                 self.assertEqual(u'this - that', sanitize_filename(u'this: that'))
  
+               self.assertEqual(sanitize_filename(u'AT&T'), u'AT&T')
                 self.assertEqual(sanitize_filename(u'ä'), u'ä')
                 self.assertEqual(sanitize_filename(u'кириллица'), u'кириллица')
  
-               for forbidden in u'"\0\\/':
-                       self.assertTrue(forbidden not in sanitize_filename(forbidden))
+               forbidden = u'"\0\\/'
+               for fc in forbidden:
+                       for fbc in forbidden:
+                               self.assertTrue(fbc not in sanitize_filename(fc))
+
+       def test_sanitize_filename_restricted(self):
+               self.assertEqual(sanitize_filename(u'abc', restricted=True), u'abc')
+               self.assertEqual(sanitize_filename(u'abc_d-e', restricted=True), u'abc_d-e')
+
+               self.assertEqual(sanitize_filename(u'123', restricted=True), u'123')
+
+               self.assertEqual(u'abc-de', sanitize_filename(u'abc/de', restricted=True))
+               self.assertFalse(u'/' in sanitize_filename(u'abc/de///', restricted=True))
+
+               self.assertEqual(u'abc-de', sanitize_filename(u'abc/<>\\*|de', restricted=True))
+               self.assertEqual(u'xxx', sanitize_filename(u'xxx/<>\\*|', restricted=True))
+               self.assertEqual(u'yes_no', sanitize_filename(u'yes? no', restricted=True))
+               self.assertEqual(u'this_-_that', sanitize_filename(u'this: that', restricted=True))
+
+               forbidden = u'"\0\\/&: \'\t\n'
+               for fc in forbidden:
+                       print('input: ' + fc + ', result: ' + repr(sanitize_filename(fc, restricted=True)))
+                       for fbc in forbidden:
+                               self.assertTrue(fbc not in sanitize_filename(fc, restricted=True))
  
         def test_ordered_set(self):
                 self.assertEqual(orderedSet([1,1,2,3,4,4,5,6,7,3,5]), [1,2,3,4,5,6,7])
diff --git a/youtube-dl.1 b/youtube-dl.1

index cfaefd0c8d2760be0e1535b25838afc22f9d1bbb..64120a8d24a3f397108d9433400d8b6e0f2e3a4d 100644 (file)
--- a/youtube-dl.1
+++ b/youtube-dl.1
@@ -59,6 +59,8 @@ redistribute it or use it however you like.
  \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ %(extractor)s\ for\ the\ provider\ (youtube,\ metacafe,
  \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ etc),\ %(id)s\ for\ the\ video\ id\ and\ %%\ for\ a\ literal
  \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ percent.\ Use\ -\ to\ output\ to\ stdout.
+--restrict-filenames\ \ \ \ \ Avoid\ some\ characters\ such\ as\ "&"\ and\ spaces\ in
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ filenames
  -a,\ --batch-file\ FILE\ \ \ \ file\ containing\ URLs\ to\ download\ (\[aq]-\[aq]\ for\ stdin)
  -w,\ --no-overwrites\ \ \ \ \ \ do\ not\ overwrite\ files
  -c,\ --continue\ \ \ \ \ \ \ \ \ \ \ resume\ partially\ downloaded\ files
@@ -210,7 +212,7 @@ Please note that Python 2.5 is not supported anymore.
  .PP
  Since June 2012 (#342) youtube-dl is packed as an executable zipfile,
  simply unzip it (might need renaming to \f[C]youtube-dl.zip\f[] first on
-some systems) or clone the git repo to see the code.
+some systems) or clone the git repository, as laid out above.
  If you modify the code, you can run it by executing the
  \f[C]__main__.py\f[] file.
  To recompile the executable, run \f[C]make\ youtube-dl\f[].
diff --git a/youtube-dl.bash-completion b/youtube-dl.bash-completion

index 76451a2b271db89de25de565fa203a64bc802620..dee191cd41b1e069cc907f113a441fd55de85dd0 100644 (file)
--- a/youtube-dl.bash-completion
+++ b/youtube-dl.bash-completion
@@ -3,7 +3,7 @@ __youtube-dl()
      local cur prev opts
      COMPREPLY=()
      cur="${COMP_WORDS[COMP_CWORD]}"
-    opts="--all-formats --audio-format --audio-quality --auto-number --batch-file --console-title --continue --cookies --dump-user-agent --extract-audio --format --get-description --get-filename --get-format --get-thumbnail --get-title --get-url --help --id --ignore-errors --keep-video --list-extractors --list-formats --literal --match-title --max-downloads --max-quality --netrc --no-continue --no-mtime --no-overwrites --no-part --no-progress --output --password --playlist-end --playlist-start --prefer-free-formats --quiet --rate-limit --reject-title --retries --simulate --skip-download --srt-lang --title --update --user-agent --username --verbose --version --write-description --write-info-json --write-srt"
+    opts="--all-formats --audio-format --audio-quality --auto-number --batch-file --console-title --continue --cookies --dump-user-agent --extract-audio --format --get-description --get-filename --get-format --get-thumbnail --get-title --get-url --help --id --ignore-errors --keep-video --list-extractors --list-formats --literal --match-title --max-downloads --max-quality --netrc --no-continue --no-mtime --no-overwrites --no-part --no-progress --output --password --playlist-end --playlist-start --prefer-free-formats --quiet --rate-limit --reject-title --restrict-filenames --retries --simulate --skip-download --srt-lang --title --update --user-agent --username --verbose --version --write-description --write-info-json --write-srt"
  
      if [[ ${cur} == * ]] ; then
          COMPREPLY=( $(compgen -W "${opts}" -- ${cur}) )
diff --git a/youtube_dl/FileDownloader.py b/youtube_dl/FileDownloader.py

index 37a842cdd1df5ee92da48fddb2e552e0bf807bad..4c79be4325b3157942956cffbf1e930fa22550d3 100644 (file)
--- a/youtube_dl/FileDownloader.py
+++ b/youtube_dl/FileDownloader.py
@@ -44,37 +44,38 @@ class FileDownloader(object):
  
         Available options:
  
-       username:         Username for authentication purposes.
-       password:         Password for authentication purposes.
-       usenetrc:         Use netrc for authentication instead.
-       quiet:            Do not print messages to stdout.
-       forceurl:         Force printing final URL.
-       forcetitle:       Force printing title.
-       forcethumbnail:   Force printing thumbnail URL.
-       forcedescription: Force printing description.
-       forcefilename:    Force printing final filename.
-       simulate:         Do not download the video files.
-       format:           Video format code.
-       format_limit:     Highest quality format to try.
-       outtmpl:          Template for output names.
-       ignoreerrors:     Do not stop on download errors.
-       ratelimit:        Download speed limit, in bytes/sec.
-       nooverwrites:     Prevent overwriting files.
-       retries:          Number of times to retry for HTTP error 5xx
-       continuedl:       Try to continue downloads if possible.
-       noprogress:       Do not print the progress bar.
-       playliststart:    Playlist item to start at.
-       playlistend:      Playlist item to end at.
-       matchtitle:       Download only matching titles.
-       rejecttitle:      Reject downloads for matching titles.
-       logtostderr:      Log messages to stderr instead of stdout.
-       consoletitle:     Display progress in console window's titlebar.
-       nopart:           Do not use temporary .part files.
-       updatetime:       Use the Last-modified header to set output file timestamps.
-       writedescription: Write the video description to a .description file
-       writeinfojson:    Write the video description to a .info.json file
-       writesubtitles:   Write the video subtitles to a .srt file
-       subtitleslang:    Language of the subtitles to download
+       username:          Username for authentication purposes.
+       password:          Password for authentication purposes.
+       usenetrc:          Use netrc for authentication instead.
+       quiet:             Do not print messages to stdout.
+       forceurl:          Force printing final URL.
+       forcetitle:        Force printing title.
+       forcethumbnail:    Force printing thumbnail URL.
+       forcedescription:  Force printing description.
+       forcefilename:     Force printing final filename.
+       simulate:          Do not download the video files.
+       format:            Video format code.
+       format_limit:      Highest quality format to try.
+       outtmpl:           Template for output names.
+       restrictfilenames: Do not allow "&" and spaces in file names
+       ignoreerrors:      Do not stop on download errors.
+       ratelimit:         Download speed limit, in bytes/sec.
+       nooverwrites:      Prevent overwriting files.
+       retries:           Number of times to retry for HTTP error 5xx
+       continuedl:        Try to continue downloads if possible.
+       noprogress:        Do not print the progress bar.
+       playliststart:     Playlist item to start at.
+       playlistend:       Playlist item to end at.
+       matchtitle:        Download only matching titles.
+       rejecttitle:       Reject downloads for matching titles.
+       logtostderr:       Log messages to stderr instead of stdout.
+       consoletitle:      Display progress in console window's titlebar.
+       nopart:            Do not use temporary .part files.
+       updatetime:        Use the Last-modified header to set output file timestamps.
+       writedescription:  Write the video description to a .description file
+       writeinfojson:     Write the video description to a .info.json file
+       writesubtitles:    Write the video subtitles to a .srt file
+       subtitleslang:     Language of the subtitles to download
         """
  
         params = None
@@ -349,7 +350,7 @@ class FileDownloader(object):
         def process_info(self, info_dict):
                 """Process a single dictionary returned by an InfoExtractor."""
  
-               info_dict['stitle'] = sanitize_filename(info_dict['title'])
+               info_dict['stitle'] = sanitize_filename(info_dict['title'], self.params.get('restrictfilenames'))
  
                 reason = self._match_entry(info_dict)
                 if reason is not None:
diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py

index bf4b55f4817514c1bceca9d17fe90e04f8587681..1109e05cd4a390373b5b81203f3ebf36a2600b41 100644 (file)
--- a/youtube_dl/__init__.py
+++ b/youtube_dl/__init__.py
@@ -272,6 +272,9 @@ def parseOpts():
                         help='number downloaded files starting from 00000', default=False)
         filesystem.add_option('-o', '--output',
                         dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(title)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), %(extractor)s for the provider (youtube, metacafe, etc), %(id)s for the video id and %% for a literal percent. Use - to output to stdout.')
+       filesystem.add_option('--restrict-filenames',
+                       action='store_true', dest='restrictfilenames',
+                       help='Avoid some characters such as "&" and spaces in filenames', default=False)
         filesystem.add_option('-a', '--batch-file',
                         dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
         filesystem.add_option('-w', '--no-overwrites',
@@ -485,6 +488,7 @@ def _real_main():
                         or (opts.useid and u'%(id)s.%(ext)s')
                         or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
                         or u'%(id)s.%(ext)s'),
+               'restrictfilenames': opts.restrictfilenames,
                 'ignoreerrors': opts.ignoreerrors,
                 'ratelimit': opts.ratelimit,
                 'nooverwrites': opts.nooverwrites,
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py

index 658fd2686b8534e834c34872883fd26d3d820cde..55f2fe02c0dc204ed6036bdab205896679699f71 100644 (file)
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -194,18 +194,22 @@ def timeconvert(timestr):
         if timetuple is not None:
                 timestamp = email.utils.mktime_tz(timetuple)
         return timestamp
-       
-def sanitize_filename(s):
-       """Sanitizes a string so it could be used as part of a filename."""
+
+def sanitize_filename(s, restricted=False):
+       """Sanitizes a string so it could be used as part of a filename.
+       If restricted is set, use a stricter subset of allowed characters.
+       """
         def replace_insane(char):
                 if char == '?' or ord(char) < 32 or ord(char) == 127:
                         return ''
                 elif char == '"':
-                       return '\''
+                       return '' if restricted else '\''
                 elif char == ':':
-                       return ' -'
+                       return '_-' if restricted else ' -'
                 elif char in '\\/|*<>':
                         return '-'
+               if restricted and (char in '&\'' or char.isspace()):
+                       return '_'
                 return char
  
         result = u''.join(map(replace_insane, s))
author	Philipp Hagemeister <phihag@phihag.de>
	Mon, 26 Nov 2012 22:58:46 +0000 (23:58 +0100)
committer	Philipp Hagemeister <phihag@phihag.de>
	Mon, 26 Nov 2012 22:58:46 +0000 (23:58 +0100)
README.md		patch \| blob \| history
test/test_utils.py		patch \| blob \| history
youtube-dl.1		patch \| blob \| history
youtube-dl.bash-completion		patch \| blob \| history
youtube_dl/FileDownloader.py		patch \| blob \| history
youtube_dl/__init__.py		patch \| blob \| history
youtube_dl/utils.py		patch \| blob \| history