Lib/HTMLParser.py source code

"""A parser for HTML and XHTML."""

# This file is based on sgmllib.py, but the API is slightly different.

# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).


import markupbase
import re

# Regular expressions used for parsing

interesting_normal = re.compile('[&<]')
incomplete = re.compile('&[a-zA-Z#]')

entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>')
commentclose = re.compile(r'--\s*>')
tagfind = re.compile('([a-zA-Z][-.a-zA-Z0-9:_]*)(?:\s|/(?!>))*')
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*')

attrfind = re.compile(
    r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')

locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:[\s/]*                          # optional whitespace before attribute name
    (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^>\s]*           # bare value
         )
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile('>')
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
# </ and the tag name, so maybe this should be fixed
endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
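
# A few examples of what the expressions above match (added for reference):
#   entityref.match('&amp; more')  matches '&amp;' with group(1) == 'amp'
#   charref.match('&#65; or &#x41;') matches the leading '&#65;'
#   attrfind captures the attribute name, the '=' part, and the (possibly
#   quoted) value; the quotes are stripped later in parse_starttag().
# Both entityref and charref require one extra character after the
# reference, so a reference sitting at the very end of the buffer is
# treated as incomplete until more data is fed.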


class HTMLParseError(Exception):
    """Exception raised for all parse errors."""

    def __init__(self, msg, position=(None, None)):
        assert msg
        self.msg = msg
        self.lineno = position[0]
        self.offset = position[1]

    def __str__(self):
        result = self.msg
        if self.lineno is not None:
            result = result + ", at line %d" % self.lineno
        if self.offset is not None:
            result = result + ", column %d" % (self.offset + 1)
        return result


class HTMLParser(markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  Entity references are
    passed by calling self.handle_entityref() with the entity
    reference as the argument.  Numeric character references are
    passed to self.handle_charref() with the string containing the
    reference as the argument.
    """

    CDATA_CONTENT_ELEMENTS = ("script", "style")

    def __init__(self):
        """Initialize and reset this instance."""
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''
        self.lasttag = '???'
        self.interesting = interesting_normal
        self.cdata_elem = None
        markupbase.ParserBase.reset(self)

    def feed(self, data):
        r"""Feed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle any buffered data."""
        self.goahead(1)

    def error(self, message):
        raise HTMLParseError(message, self.getpos())

    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    def set_cdata_mode(self, elem):
        self.cdata_elem = elem.lower()
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

    def clear_cdata_mode(self):
        self.interesting = interesting_normal
        self.cdata_elem = None
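
    # Note (added): while cdata_elem is set (after a <script> or <style>
    # start tag), self.interesting only matches the corresponding end tag,
    # so everything inside the element is passed through handle_data()
    # unparsed; entity and character references are not recognised there.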

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            match = self.interesting.search(rawdata, i) # < or &
            if match:
                j = match.start()
            else:
                if self.cdata_elem:
                    break
                j = n
            if i < j: self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n: break
            startswith = rawdata.startswith
            if startswith('<', i):
                if starttagopen.match(rawdata, i): # < + letter
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                elif startswith("<!--", i):
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    k = self.parse_html_declaration(i)
                elif (i + 1) < n:
                    self.handle_data("<")
                    k = i + 1
                else:
                    break
                if k < 0:
                    if not end:
                        break
                    k = rawdata.find('>', i + 1)
                    if k < 0:
                        k = rawdata.find('<', i + 1)
                        if k < 0:
                            k = i + 1
                    else:
                        k += 1
                    self.handle_data(rawdata[i:k])
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                match = charref.match(rawdata, i)
                if match:
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    if ";" in rawdata[i:]: #bail by consuming &#
                        self.handle_data(rawdata[i:i+2])
                        i = self.updatepos(i, i+2)
                    break
            elif startswith('&', i):
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    if end and match.group() == rawdata[i:]:
                        self.error("EOF in middle of entity or char ref")
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                assert 0, "interesting.search() lied"
        # end while
        if end and i < n and not self.cdata_elem:
            self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        self.rawdata = rawdata[i:]

    # Internal -- parse html declarations, return length or -1 if not terminated
    # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
    # See also parse_declaration in _markupbase
    def parse_html_declaration(self, i):
        rawdata = self.rawdata
        if rawdata[i:i+2] != '<!':
            self.error('unexpected call to parse_html_declaration()')
        if rawdata[i:i+4] == '<!--':
            # this case is actually already handled in goahead()
            return self.parse_comment(i)
        elif rawdata[i:i+3] == '<![':
            return self.parse_marked_section(i)
        elif rawdata[i:i+9].lower() == '<!doctype':
            # find the closing >
            gtpos = rawdata.find('>', i+9)
            if gtpos == -1:
                return -1
            self.handle_decl(rawdata[i+2:gtpos])
            return gtpos+1
        else:
            return self.parse_bogus_comment(i)

    # Internal -- parse bogus comment, return length or -1 if not terminated
    # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
    def parse_bogus_comment(self, i, report=1):
        rawdata = self.rawdata
        if rawdata[i:i+2] not in ('<!', '</'):
            self.error('unexpected call to parse_bogus_comment()')
        pos = rawdata.find('>', i+2)
        if pos == -1:
            return -1
        if report:
            self.handle_comment(rawdata[i+2:pos])
        return pos + 1

    # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i+2) # >
        if not match:
            return -1
        j = match.start()
        self.handle_pi(rawdata[i+2: j])
        j = match.end()
        return j

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and endpos into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = match.group(1).lower()

        while k < endpos:
            m = attrfind.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            lineno, offset = self.getpos()
            if "\n" in self.__starttag_text:
                lineno = lineno + self.__starttag_text.count("\n")
                offset = len(self.__starttag_text) \
                         - self.__starttag_text.rfind("\n")
            else:
                offset = offset + len(self.__starttag_text)
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos
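
    # For example (added): parsing '<A HREF="x.html" CLASS=y>' calls
    # handle_starttag('a', [('href', 'x.html'), ('class', 'y')]), and
    # '<br/>' calls handle_startendtag('br', []).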

    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        rawdata = self.rawdata
        m = locatestarttagend.match(rawdata, i)
        if m:
            j = m.end()
            next = rawdata[j:j+1]
            if next == ">":
                return j + 1
            if next == "/":
                if rawdata.startswith("/>", j):
                    return j + 2
                if rawdata.startswith("/", j):
                    # buffer boundary
                    return -1
                # else bogus input
                self.updatepos(i, j + 1)
                self.error("malformed empty start tag")
            if next == "":
                # end of input
                return -1
            if next in ("abcdefghijklmnopqrstuvwxyz=/"
                        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                # end of input in or before attribute value, or we have the
                # '/' from a '/>' ending
                return -1
            if j > i:
                return j
            else:
                return i + 1
        raise AssertionError("we should not get here!")

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i+1) # >
        if not match:
            return -1
        gtpos = match.end()
        match = endtagfind.match(rawdata, i) # </ + tag + >
        if not match:
            if self.cdata_elem is not None:
                self.handle_data(rawdata[i:gtpos])
                return gtpos
            # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
            namematch = tagfind_tolerant.match(rawdata, i+2)
            if not namematch:
                # w3.org/TR/html5/tokenization.html#end-tag-open-state
                if rawdata[i:i+3] == '</>':
                    return i+3
                else:
                    return self.parse_bogus_comment(i)
            tagname = namematch.group().lower()
            # consume and ignore other stuff between the name and the >
            # Note: this is not 100% correct, since we might have things like
            # </tag attr=">">, but looking for > after the name should cover
            # most of the cases and is much simpler
            gtpos = rawdata.find('>', namematch.end())
            self.handle_endtag(tagname)
            return gtpos+1

        elem = match.group(1).lower() # script or style
        if self.cdata_elem is not None:
            if elem != self.cdata_elem:
                self.handle_data(rawdata[i:gtpos])
                return gtpos

        self.handle_endtag(elem)
        self.clear_cdata_mode()
        return gtpos
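
    # For example (added): '</P >' results in handle_endtag('p').  While in
    # CDATA mode (inside <script>/<style>), an end tag other than the one
    # that opened the element is reported as data instead.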

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        pass

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        pass

    # Overridable -- handle character reference
    def handle_charref(self, name):
        pass

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        pass

    # Overridable -- handle data
    def handle_data(self, data):
        pass

    # Overridable -- handle comment
    def handle_comment(self, data):
        pass

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        pass

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        pass

    # Overridable -- handle unknown declaration (called from markupbase)
    def unknown_decl(self, data):
        pass

    # Internal -- helper to remove special character quoting
    entitydefs = None
    def unescape(self, s):
        if '&' not in s:
            return s
        def replaceEntities(s):
            s = s.groups()[0]
            try:
                if s[0] == "#":
                    s = s[1:]
                    if s[0] in ['x','X']:
                        c = int(s[1:], 16)
                    else:
                        c = int(s)
                    return unichr(c)
            except ValueError:
                return '&#'+s+';'
            else:
                # Cannot use name2codepoint directly, because HTMLParser supports apos,
                # which is not part of HTML 4
                import htmlentitydefs
                if HTMLParser.entitydefs is None:
                    entitydefs = HTMLParser.entitydefs = {'apos':u"'"}
                    for k, v in htmlentitydefs.name2codepoint.iteritems():
                        entitydefs[k] = unichr(v)
                try:
                    return self.entitydefs[s]
                except KeyError:
                    return '&'+s+';'

        return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities, s)
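
As a quick illustration of the handler API described in the HTMLParser docstring above, here is a minimal usage sketch (Python 2, matching the module; the subclass name and the sample markup are made up for illustration):

from HTMLParser import HTMLParser

class MyParser(HTMLParser):
    # each handler simply prints what the parser reports
    def handle_starttag(self, tag, attrs):
        print "start tag:", tag, attrs
    def handle_endtag(self, tag):
        print "end tag:  ", tag
    def handle_data(self, data):
        print "data:     ", repr(data)
    def handle_entityref(self, name):
        print "entityref:", name
    def handle_charref(self, name):
        print "charref:  ", name

p = MyParser()
p.feed('<p class="note">Ben &amp; Jerry &#8482;</p>')
p.close()

feed() may be called any number of times with arbitrary chunks; input that stops in the middle of a tag or reference is kept in self.rawdata until enough data arrives, and close() flushes whatever remains.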
Original article: https://www.cnblogs.com/hzhida/p/2635473.html