- """Transforms an HTML entity to a character.
-
- This function receives a match object and is intended to be used with
- the re.sub() function.
- """
- entity = matchobj.group(1)
-
- # Known non-numeric HTML entity
- if entity in htmlentitydefs.name2codepoint:
- return unichr(htmlentitydefs.name2codepoint[entity])
-
- mobj = re.match(ur'(?u)#(x?\d+)', entity)
- if mobj is not None:
- numstr = mobj.group(1)
- if numstr.startswith(u'x'):
- base = 16
- numstr = u'0%s' % numstr
- else:
- base = 10
- return unichr(int(numstr, base))
-
- # Unknown entity in name, return its literal representation
- return (u'&%s;' % entity)
-
# Backport bugfix: override the module-level ``locatestarttagend`` regex of
# the HTMLParser module. This is a module-wide assignment, so it applies to
# every parser created from that module, not only to the classes below.
# NOTE(review): the pattern presumably mirrors a later standard-library
# release -- confirm before changing it.
HTMLParser.locatestarttagend = re.compile(
    r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""",
    re.VERBOSE)
class IDParser(HTMLParser.HTMLParser):
    """Modified HTMLParser that isolates a tag with the specified id.

    Feed a document with loads(), then call get_result() to obtain the
    markup found between the opening and closing tag of the element whose
    ``id`` attribute equals the requested value.
    """

    def __init__(self, id):
        self.id = id
        # Grows while parsing: [tag] -> [tag, start_pos] ->
        # [tag, start_pos, end_pos]; positions are getpos() tuples.
        self.result = None
        # True while the parser is inside the requested element.
        self.started = False
        # Number of currently open tags, per tag name, inside the element.
        self.depth = {}
        self.html = None
        # True until the position right after the opening tag is recorded.
        self.watch_startpos = False
        self.error_count = 0
        HTMLParser.HTMLParser.__init__(self)

    def error(self, message):
        """Skip the offending line and resume, or raise HTMLParseError.

        The error is raised instead of skipped once more than ten errors
        have been skipped, or when the requested element is being captured.
        """
        if self.error_count > 10 or self.started:
            raise HTMLParser.HTMLParseError(message, self.getpos())
        # Restart on the text that follows the current line.
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:])  # skip one line
        self.error_count += 1
        self.goahead(1)

    def loads(self, html):
        """Parse the whole document ``html`` in one go."""
        self.html = html
        self.feed(html)
        self.close()

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            # Already inside the requested element: a nested tag must not
            # restart the capture, even when it carries the same id.
            self.find_startpos(None)
        elif 'id' in attrs and attrs['id'] == self.id:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
            # Forget counts left over from a previously captured element.
            self.depth = {}
        if self.started:
            self.depth[tag] = self.depth.get(tag, 0) + 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth:
                self.depth[tag] -= 1
            # The element is complete once every tag of its kind is closed.
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())

    # Whatever comes first after the opening tag marks the start position.
    handle_entityref = handle_charref = handle_data = handle_comment = \
        handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped markup inside the requested element.

        Returns None when no element matched or when its closing tag was
        never seen.
        """
        if self.result is None or len(self.result) != 3:
            return None
        (start_line, start_col), (end_line, end_col) = self.result[1:]
        lines = self.html.split('\n')[start_line - 1:end_line]
        lines[0] = lines[0][start_col:]
        if len(lines) == 1:
            # Start and end share a line that was just cut at start_col.
            end_col -= start_col
        lines[-1] = lines[-1][:end_col]
        return '\n'.join(lines).strip()
+ """Transforms an HTML entity to a character.
+
+ This function receives a match object and is intended to be used with
+ the re.sub() function.
+ """
+ entity = matchobj.group(1)
+
+ # Known non-numeric HTML entity
+ if entity in compat_html_entities.name2codepoint:
+ return compat_chr(compat_html_entities.name2codepoint[entity])
+
+ mobj = re.match(u'(?u)#(x?\\d+)', entity)
+ if mobj is not None:
+ numstr = mobj.group(1)
+ if numstr.startswith(u'x'):
+ base = 16
+ numstr = u'0%s' % numstr
+ else:
+ base = 10
+ return compat_chr(int(numstr, base))
+
+ # Unknown entity in name, return its literal representation
+ return (u'&%s;' % entity)
+
# Backport bugfix: override the module-level ``locatestarttagend`` regex of
# the HTML parser module. This is a module-wide assignment, so it applies to
# every parser created from that module, not only to the classes below.
# NOTE(review): the pattern presumably mirrors a later standard-library
# release -- confirm before changing it.
compat_html_parser.locatestarttagend = re.compile(
    r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""",
    re.VERBOSE)
class AttrParser(compat_html_parser.HTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute.

    Feed a document with loads(), then call get_result() to obtain the
    markup found between the opening and closing tag of the element whose
    ``attribute`` equals ``value``.
    """

    def __init__(self, attribute, value):
        self.attribute = attribute
        self.value = value
        # Grows while parsing: [tag] -> [tag, start_pos] ->
        # [tag, start_pos, end_pos]; positions are getpos() tuples.
        self.result = None
        # True while the parser is inside the requested element.
        self.started = False
        # Number of currently open tags, per tag name, inside the element.
        self.depth = {}
        self.html = None
        # True until the position right after the opening tag is recorded.
        self.watch_startpos = False
        self.error_count = 0
        compat_html_parser.HTMLParser.__init__(self)

    def error(self, message):
        """Skip the offending line and resume, or raise HTMLParseError.

        The error is raised instead of skipped once more than ten errors
        have been skipped, or when the requested element is being captured.
        """
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        # Restart on the text that follows the current line.
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:])  # skip one line
        self.error_count += 1
        self.goahead(1)

    def loads(self, html):
        """Parse the whole document ``html`` in one go."""
        self.html = html
        self.feed(html)
        self.close()

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            # Already inside the requested element: a nested tag must not
            # restart the capture, even when it carries the same attribute.
            self.find_startpos(None)
        elif self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
            # Forget counts left over from a previously captured element.
            self.depth = {}
        if self.started:
            self.depth[tag] = self.depth.get(tag, 0) + 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth:
                self.depth[tag] -= 1
            # The element is complete once every tag of its kind is closed.
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested attribute"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())

    # Whatever comes first after the opening tag marks the start position.
    handle_entityref = handle_charref = handle_data = handle_comment = \
        handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped markup inside the requested element.

        Returns None when no element matched or when its closing tag was
        never seen.
        """
        if self.result is None or len(self.result) != 3:
            return None
        (start_line, start_col), (end_line, end_col) = self.result[1:]
        lines = self.html.split('\n')[start_line - 1:end_line]
        lines[0] = lines[0][start_col:]
        if len(lines) == 1:
            # Start and end share a line that was just cut at start_col.
            end_col -= start_col
        lines[-1] = lines[-1][:end_col]
        return '\n'.join(lines).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
# On interpreters older than 2.7.3, step over the literal text "</scr'+'ipt>"
# instead of passing it to the stock end-tag parser; every other end tag is
# still delegated to HTMLParser.parse_endtag.
if sys.version_info < (2, 7, 3):
    _SPLIT_SCRIPT_END = "</scr'+'ipt>"
    AttrParser.parse_endtag = (
        lambda self, i:
            i + len(_SPLIT_SCRIPT_END)
            if self.rawdata.startswith(_SPLIT_SCRIPT_END, i)
            else compat_html_parser.HTMLParser.parse_endtag(self, i))