[pornhd] Fix extraction (fixes #4915)

[youtube-dl.git] / youtube_dl / YoutubeDL.py
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py

index ae62432c66d04dbc2bbf8027f9919cedb2a28af7..dda222feef201eefd642cb5a6fd9f4d79b983648 100755 (executable)
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -25,6 +25,7 @@ if os.name == 'nt':
      import ctypes
  
  from .compat import (
      import ctypes
  
  from .compat import (
+    compat_basestring,
      compat_cookiejar,
      compat_expanduser,
      compat_http_client,
      compat_cookiejar,
      compat_expanduser,
      compat_http_client,
@@ -227,6 +228,11 @@ class YoutubeDL(object):
      external_downloader:  Executable of the external downloader to call.
      listformats:       Print an overview of available video formats and exit.
      list_thumbnails:   Print a table of all thumbnails and exit.
      external_downloader:  Executable of the external downloader to call.
      listformats:       Print an overview of available video formats and exit.
      list_thumbnails:   Print a table of all thumbnails and exit.
+    match_filter:      A function that gets called with the info_dict of
+                       every video.
+                       If it returns a message, the video is ignored.
+                       If it returns None, the video is downloaded.
+                       match_filter_func in utils.py is one example of such a function.
  
  
      The following parameters are not used by YoutubeDL itself, they are used by
  
  
      The following parameters are not used by YoutubeDL itself, they are used by
@@ -543,6 +549,11 @@ class YoutubeDL(object):
              outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
              tmpl = compat_expanduser(outtmpl)
              filename = tmpl % template_dict
              outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
              tmpl = compat_expanduser(outtmpl)
              filename = tmpl % template_dict
+            # Temporary fix for #4787
+            # 'Treat' all problem characters by passing filename through preferredencoding
+            # to work around encoding issues with subprocess on Python 2 on Windows
+            if sys.version_info < (3, 0) and sys.platform == 'win32':
+                filename = encodeFilename(filename, True).decode(preferredencoding())
              return filename
          except ValueError as err:
              self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
              return filename
          except ValueError as err:
              self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
@@ -577,9 +588,16 @@ class YoutubeDL(object):
              if max_views is not None and view_count > max_views:
                  return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
          if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')):
              if max_views is not None and view_count > max_views:
                  return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
          if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')):
-            return 'Skipping "%s" because it is age restricted' % title
+            return 'Skipping "%s" because it is age restricted' % video_title
          if self.in_download_archive(info_dict):
              return '%s has already been recorded in archive' % video_title
          if self.in_download_archive(info_dict):
              return '%s has already been recorded in archive' % video_title
+
+        match_filter = self.params.get('match_filter')
+        if match_filter is not None:
+            ret = match_filter(info_dict)
+            if ret is not None:
+                return ret
+
          return None
  
      @staticmethod
          return None
  
      @staticmethod
@@ -820,26 +838,43 @@ class YoutubeDL(object):
              '!=': operator.ne,
          }
          operator_rex = re.compile(r'''(?x)\s*\[
              '!=': operator.ne,
          }
          operator_rex = re.compile(r'''(?x)\s*\[
-            (?P<key>width|height|tbr|abr|vbr|filesize)
+            (?P<key>width|height|tbr|abr|vbr|asr|filesize|fps)
              \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
              (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)
              \]$
              ''' % '|'.join(map(re.escape, OPERATORS.keys())))
          m = operator_rex.search(format_spec)
              \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
              (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)
              \]$
              ''' % '|'.join(map(re.escape, OPERATORS.keys())))
          m = operator_rex.search(format_spec)
+        if m:
+            try:
+                comparison_value = int(m.group('value'))
+            except ValueError:
+                comparison_value = parse_filesize(m.group('value'))
+                if comparison_value is None:
+                    comparison_value = parse_filesize(m.group('value') + 'B')
+                if comparison_value is None:
+                    raise ValueError(
+                        'Invalid value %r in format specification %r' % (
+                            m.group('value'), format_spec))
+            op = OPERATORS[m.group('op')]
+
          if not m:
          if not m:
-            raise ValueError('Invalid format specification %r' % format_spec)
+            STR_OPERATORS = {
+                '=': operator.eq,
+                '!=': operator.ne,
+            }
+            str_operator_rex = re.compile(r'''(?x)\s*\[
+                \s*(?P<key>ext|acodec|vcodec|container|protocol)
+                \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?
+                \s*(?P<value>[a-zA-Z0-9_-]+)
+                \s*\]$
+                ''' % '|'.join(map(re.escape, STR_OPERATORS.keys())))
+            m = str_operator_rex.search(format_spec)
+            if m:
+                comparison_value = m.group('value')
+                op = STR_OPERATORS[m.group('op')]
  
  
-        try:
-            comparison_value = int(m.group('value'))
-        except ValueError:
-            comparison_value = parse_filesize(m.group('value'))
-            if comparison_value is None:
-                comparison_value = parse_filesize(m.group('value') + 'B')
-            if comparison_value is None:
-                raise ValueError(
-                    'Invalid value %r in format specification %r' % (
-                        m.group('value'), format_spec))
-        op = OPERATORS[m.group('op')]
+        if not m:
+            raise ValueError('Invalid format specification %r' % format_spec)
  
          def _filter(f):
              actual_value = f.get(m.group('key'))
  
          def _filter(f):
              actual_value = f.get(m.group('key'))
@@ -932,6 +967,9 @@ class YoutubeDL(object):
              def has_header(self, h):
                  return h in self.headers
  
              def has_header(self, h):
                  return h in self.headers
  
+            def get_header(self, h, default=None):
+                return self.headers.get(h, default)
+
          pr = _PseudoRequest(info_dict['url'])
          self.cookiejar.add_cookie_header(pr)
          return pr.headers.get('Cookie')
          pr = _PseudoRequest(info_dict['url'])
          self.cookiejar.add_cookie_header(pr)
          return pr.headers.get('Cookie')
@@ -953,14 +991,16 @@ class YoutubeDL(object):
          if thumbnails is None:
              thumbnail = info_dict.get('thumbnail')
              if thumbnail:
          if thumbnails is None:
              thumbnail = info_dict.get('thumbnail')
              if thumbnail:
-                thumbnails = [{'url': thumbnail}]
+                info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}]
          if thumbnails:
              thumbnails.sort(key=lambda t: (
                  t.get('preference'), t.get('width'), t.get('height'),
                  t.get('id'), t.get('url')))
          if thumbnails:
              thumbnails.sort(key=lambda t: (
                  t.get('preference'), t.get('width'), t.get('height'),
                  t.get('id'), t.get('url')))
-            for t in thumbnails:
+            for i, t in enumerate(thumbnails):
                  if 'width' in t and 'height' in t:
                      t['resolution'] = '%dx%d' % (t['width'], t['height'])
                  if 'width' in t and 'height' in t:
                      t['resolution'] = '%dx%d' % (t['width'], t['height'])
+                if t.get('id') is None:
+                    t['id'] = '%d' % i
  
          if thumbnails and 'thumbnail' not in info_dict:
              info_dict['thumbnail'] = thumbnails[-1]['url']
  
          if thumbnails and 'thumbnail' not in info_dict:
              info_dict['thumbnail'] = thumbnails[-1]['url']
@@ -1068,8 +1108,10 @@ class YoutubeDL(object):
                                  else self.params['merge_output_format'])
                              selected_format = {
                                  'requested_formats': formats_info,
                                  else self.params['merge_output_format'])
                              selected_format = {
                                  'requested_formats': formats_info,
-                                'format': rf,
-                                'ext': formats_info[0]['ext'],
+                                'format': '%s+%s' % (formats_info[0].get('format'),
+                                                     formats_info[1].get('format')),
+                                'format_id': '%s+%s' % (formats_info[0].get('format_id'),
+                                                        formats_info[1].get('format_id')),
                                  'width': formats_info[0].get('width'),
                                  'height': formats_info[0].get('height'),
                                  'resolution': formats_info[0].get('resolution'),
                                  'width': formats_info[0].get('width'),
                                  'height': formats_info[0].get('height'),
                                  'resolution': formats_info[0].get('resolution'),
@@ -1516,7 +1558,6 @@ class YoutubeDL(object):
              line(f, idlen) for f in formats
              if f.get('preference') is None or f['preference'] >= -1000]
          if len(formats) > 1:
              line(f, idlen) for f in formats
              if f.get('preference') is None or f['preference'] >= -1000]
          if len(formats) > 1:
-            formats_s[0] += (' ' if self._format_note(formats[0]) else '') + '(worst)'
              formats_s[-1] += (' ' if self._format_note(formats[-1]) else '') + '(best)'
  
          header_line = line({
              formats_s[-1] += (' ' if self._format_note(formats[-1]) else '') + '(best)'
  
          header_line = line({
@@ -1552,7 +1593,7 @@ class YoutubeDL(object):
          # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
          # To work around aforementioned issue we will replace request's original URL with
          # percent-encoded one
          # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
          # To work around aforementioned issue we will replace request's original URL with
          # percent-encoded one
-        req_is_string = isinstance(req, basestring if sys.version_info < (3, 0) else compat_str)
+        req_is_string = isinstance(req, compat_basestring)
          url = req if req_is_string else req.get_full_url()
          url_escaped = escape_url(url)
  
          url = req if req_is_string else req.get_full_url()
          url_escaped = escape_url(url)