集成富文本编辑器XSS预防过滤措施

# https://github.com/phith0n/python-xss-filter

import re
import copy
from html.parser import HTMLParser


class XSSHtml(HTMLParser):
    allow_tags = ['a', 'img', 'br', 'strong', 'b', 'code', 'pre',
                  'p', 'div', 'em', 'span', 'h1', 'h2', 'h3', 'h4',
                  'h5', 'h6', 'blockquote', 'ul', 'ol', 'tr', 'th', 'td',
                  'hr', 'li', 'u', 'embed', 's', 'table', 'thead', 'tbody',
                  'caption', 'small', 'q', 'sup', 'sub', 'font']
    common_attrs = ["style", "class", "name"]
    nonend_tags = ["img", "hr", "br", "embed"]
    tags_own_attrs = {
        "img": ["src", "width", "height", "alt", "align"],
        "a": ["href", "target", "rel", "title"],
        "embed": ["src", "width", "height", "type", "allowfullscreen", "loop", "play", "wmode", "menu"],
        "table": ["border", "cellpadding", "cellspacing"],
        "font": ["color"]
    }

    def __init__(self, allows=[]):
        HTMLParser.__init__(self)
        self.allow_tags = allows if allows else self.allow_tags
        self.result = []
        self.start = []
        self.data = []

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        super().close()

    def clean(self, content):
        self.feed(content)
        return self.get_html()

    def get_html(self):
        """
        Get the safe html code
        """
        for i in range(0, len(self.result)):
            if self.result[i].strip('
'):
                self.data.append(self.result[i])
        return ''.join(self.data)

    def handle_startendtag(self, tag, attrs):
        self.handle_starttag(tag, attrs)

    def handle_starttag(self, tag, attrs):
        if tag not in self.allow_tags:
            return
        end_diagonal = ' /' if tag in self.nonend_tags else ''
        if not end_diagonal:
            self.start.append(tag)
        attdict = {}
        for attr in attrs:
            attdict[attr[0]] = attr[1]

        attdict = self._wash_attr(attdict, tag)
        if hasattr(self, "node_%s" % tag):
            attdict = getattr(self, "node_%s" % tag)(attdict)
        else:
            attdict = self.node_default(attdict)

        attrs = []
        for (key, value) in attdict.items():
            attrs.append('%s="%s"' % (key, self._htmlspecialchars(value)))
        attrs = (' ' + ' '.join(attrs)) if attrs else ''
        self.result.append('<' + tag + attrs + end_diagonal + '>')

    def handle_endtag(self, tag):
        if self.start and tag == self.start[len(self.start) - 1]:
            self.result.append('</' + tag + '>')
            self.start.pop()

    def handle_data(self, data):
        self.result.append(self._htmlspecialchars(data))

    def handle_entityref(self, name):
        if name.isalpha():
            self.result.append("&%s;" % name)

    def handle_charref(self, name):
        if name.isdigit():
            self.result.append("&#%s;" % name)

    def node_default(self, attrs):
        attrs = self._common_attr(attrs)
        return attrs

    def node_a(self, attrs):
        attrs = self._common_attr(attrs)
        attrs = self._get_link(attrs, "href")
        attrs = self._set_attr_default(attrs, "target", "_blank")
        attrs = self._limit_attr(attrs, {
            "target": ["_blank", "_self"]
        })
        return attrs

    def node_embed(self, attrs):
        attrs = self._common_attr(attrs)
        attrs = self._get_link(attrs, "src")
        attrs = self._limit_attr(attrs, {
            "type": ["application/x-shockwave-flash"],
            "wmode": ["transparent", "window", "opaque"],
            "play": ["true", "false"],
            "loop": ["true", "false"],
            "menu": ["true", "false"],
            "allowfullscreen": ["true", "false"]
        })
        attrs["allowscriptaccess"] = "never"
        attrs["allownetworking"] = "none"
        return attrs

    def _true_url(self, url):
        prog = re.compile(r"^(http|https|ftp)://.+", re.I | re.S)
        if prog.match(url):
            return url
        else:
            return "http://%s" % url

    def _true_style(self, style):
        if style:
            style = re.sub(r"(\|&#|/*|*/)", "_", style)
            style = re.sub(r"e.*x.*p.*r.*e.*s.*s.*i.*o.*n", "_", style)
        return style

    def _get_style(self, attrs):
        if "style" in attrs:
            attrs["style"] = self._true_style(attrs.get("style"))
        return attrs

    def _get_link(self, attrs, name):
        if name in attrs:
            attrs[name] = self._true_url(attrs[name])
        return attrs

    def _wash_attr(self, attrs, tag):
        if tag in self.tags_own_attrs:
            other = self.tags_own_attrs.get(tag)
        else:
            other = []
        if attrs:
            for key, value in copy.deepcopy(attrs).items():
                if key not in self.common_attrs + other:
                    del attrs[key]
        return attrs

    def _common_attr(self, attrs):
        attrs = self._get_style(attrs)
        return attrs

    def _set_attr_default(self, attrs, name, default=''):
        if name not in attrs:
            attrs[name] = default
        return attrs

    def _limit_attr(self, attrs, limit={}):
        for (key, value) in limit.items():
            if key in attrs and attrs[key] not in value:
                del attrs[key]
        return attrs

    def _htmlspecialchars(self, html):
        return html.replace("<", "<") 
            .replace(">", ">") 
            .replace('"', """) 
            .replace("'", "'")


if "__main__" == __name__:
    with XSSHtml() as parser:
        ret = parser.clean("""<p><img src=1 onerror=alert(/xss/)></p><div class="left">
            <a href='javascript:prompt(1)'><br />hehe</a></div>
            <p id="test" onmouseover="alert(1)">>M<svg>
            <a href="https://www.baidu.com" target="self">MM</a></p>
            <embed src='javascript:alert(/hehe/)' allowscriptaccess=always />
            <img onerror=alert(1) src=#>""")
        print(ret)

 1 from urlparse import urlparse
 2 
 3 import bleach
 4 
 5 
 6 class XSSFilter(object):
 7     tags = ['p', 'div', 'img', 'br', 'span', 'pre', 'code', 'blockquote', 'ol', 'ul', 'li']
 8     styles = [
 9         'max-width', 'color', 'margin', 'line-height', 'display', 'padding', 'background-color',
10         'display', 'border-left', 'font-family', 'white-space', 'font-size'
11     ]
12 
13     @staticmethod
14     def allowed_src(tag, name, value):
15         if name in ('style', 'src', 'alt', 'data-w-e'):
16             return True
17         if name == 'src':
18             p = urlparse(value)
19             return XSSFilter._trusted_url(p)
20         return False
21 
22     @classmethod
23     def clean(cls, html):
24         return bleach.clean(html, tags=cls.tags, attributes=cls.allowed_src, styles=cls.styles)
25 
26     @classmethod
27     def _trusted_url(cls, url):
28         return url.netloc == 'xxxx.xxxx.com' or 'static/gif' in url.path