import json

from lxml import etree


class HtmlParser(object):
    """Parse a downloaded HTML page and list the absolute links it contains.

    ``parser`` is the entry point. The ``_get_new_urls`` and
    ``_get_new_data`` helpers are still unimplemented placeholders, so the
    pair returned by ``parser`` currently holds ``None`` in both positions.
    """

    def _get_new_urls(self):
        """Extract URLs from the page (placeholder: returns ``None``)."""
        # TODO: implement URL extraction.

    def _get_new_data(self):
        """Extract content from the page (placeholder: returns ``None``)."""
        # TODO: implement content extraction.

    def parser(self, page_url, html_cont_str):
        """Parse *html_cont_str* and print every absolute/protocol-relative link.

        The number of matching ``<a>`` elements is printed first, followed by
        one numbered line per link (numbering starts at 1).

        Args:
            page_url: URL the HTML was fetched from. Only checked against
                ``None``; it is not otherwise used yet.
            html_cont_str: HTML document text to parse.

        Returns:
            ``None`` when either argument is ``None``, when the HTML is
            empty, or when no element tree could be built from it. Otherwise
            the tuple ``(new_urls, new_data)`` produced by the placeholder
            helpers.
        """
        # Empty HTML is rejected up front together with the None cases.
        if page_url is None or not html_cont_str:
            return None

        # Build an element tree from the raw HTML text.
        html_etree = etree.HTML(html_cont_str)
        if html_etree is None:
            # NOTE(review): lxml appears to yield no tree for some blank or
            # unparseable inputs; bail out instead of failing on `.xpath`.
            return None

        # Select anchors whose href starts with "http" or with "//".
        node_list = html_etree.xpath(
            "//a[starts-with(@href,'http')]|//a[starts-with(@href,'//')]"
        )
        print(len(node_list))

        # Walk the anchors and print a numbered line for each href.
        for i, node in enumerate(node_list, start=1):
            # The XPath predicate above guarantees the attribute exists, so a
            # direct attribute read replaces the per-node XPath lookup.
            a_href = node.get("href")
            print('No.%3s: %s' % (i, a_href))

        new_urls = self._get_new_urls()
        new_data = self._get_new_data()
        return new_urls, new_data