Work around buggy HTML Parser in Python < 2.7.3 (Closes #662)

[youtube-dl.git] / youtube_dl / utils.py
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py

index 463804e183117b23efd8f0e4b0ad9b132e519ddb..e6ce028d620e0c68952ffe18813cfb1a885beef7 100644 (file)
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -8,6 +8,7 @@ import locale
  import os
  import re
  import sys
+import traceback
  import zlib
  import email.utils
  import json
@@ -154,6 +155,7 @@ std_headers = {
      'Accept-Encoding': 'gzip, deflate',
      'Accept-Language': 'en-us,en;q=0.5',
  }
+
  def preferredencoding():
      """Get preferred encoding.
  
@@ -187,7 +189,6 @@ else:
          with open(fn, 'w', encoding='utf-8') as f:
              json.dump(obj, f)
  
-
  def htmlentity_transform(matchobj):
      """Transforms an HTML entity to a character.
  
@@ -279,6 +280,12 @@ class AttrParser(compat_html_parser.HTMLParser):
              lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
          lines[-1] = lines[-1][:self.result[2][1]]
          return '\n'.join(lines).strip()
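+# Hack for https://github.com/rg3/youtube-dl/issues/662: on Python < 2.7.3, step over a literal "</scr'+'ipt>" instead of handing it to HTMLParser.parse_endtag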
+if sys.version_info < (2, 7, 3):
+    AttrParser.parse_endtag = (lambda self, i:
+        i + len("</scr'+'ipt>")
+        if self.rawdata[i:].startswith("</scr'+'ipt>")
+        else compat_html_parser.HTMLParser.parse_endtag(self, i))
  
  def get_element_by_id(id, html):
      """Return the content of the tag with the specified ID in the passed HTML document"""
@@ -408,7 +415,24 @@ def encodeFilename(s):
          # match Windows 9x series as well. Besides, NT 4 is obsolete.)
          return s
      else:
-        return s.encode(sys.getfilesystemencoding(), 'ignore')
+        encoding = sys.getfilesystemencoding()
+        if encoding is None:
+            encoding = 'utf-8'
+        return s.encode(encoding, 'ignore')
+
+
+class ExtractorError(Exception):
+    """Error during info extraction."""
+    def __init__(self, msg, tb=None):
+        """ tb, if given, is the original traceback (so that it can be printed out). """
+        super(ExtractorError, self).__init__(msg)
+        self.traceback = tb
+
+    def format_traceback(self):
+        if self.traceback is None:
+            return None
+        return u''.join(traceback.format_tb(self.traceback))
+
  
  class DownloadError(Exception):
      """Download Error exception.
@@ -435,7 +459,8 @@ class PostProcessingError(Exception):
      This exception may be raised by PostProcessor's .run() method to
      indicate an error in the postprocessing task.
      """
-    pass
+    def __init__(self, msg):
+        self.msg = msg
  
  class MaxDownloadsReached(Exception):
      """ --max-downloads limit has been reached. """
@@ -500,14 +525,19 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
          return ret
  
      def http_request(self, req):
-        for h in std_headers:
+        for h,v in std_headers.items():
              if h in req.headers:
                  del req.headers[h]
-            req.add_header(h, std_headers[h])
+            req.add_header(h, v)
          if 'Youtubedl-no-compression' in req.headers:
              if 'Accept-encoding' in req.headers:
                  del req.headers['Accept-encoding']
              del req.headers['Youtubedl-no-compression']
+        if 'Youtubedl-user-agent' in req.headers:
+            if 'User-agent' in req.headers:
+                del req.headers['User-agent']
+            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
+            del req.headers['Youtubedl-user-agent']
          return req
  
      def http_response(self, req, resp):