# COM-IE-(2)

# -*- coding:UTF-8 -*- 
import sys
from time import sleep
import win32com.client
from win32com.client import DispatchEx

# Python 2 only: switch the process-wide default str<->unicode encoding to
# UTF-8.  reload(sys) is called first so that sys.setdefaultencoding can be
# used at this point.  The three standard streams are saved beforehand and
# put back afterwards -- presumably because reload(sys) can rebind them in
# some environments (TODO confirm for the hosts this script runs under).
stdin, stdout, stderr = sys.stdin, sys.stdout, sys.stderr
reload(sys)
sys.setdefaultencoding("utf-8")
sys.stdin, sys.stdout, sys.stderr = stdin, stdout, stderr

class COM_IE:
	"""Drive an Internet Explorer window through its COM automation interface.

	On construction, an already-open shell window whose LocationURL equals
	``url`` is reused; when there is none, a new Internet Explorer instance is
	started and navigated to ``url``.

	Attributes:
		url: the URL passed to the constructor.
		Visible: visibility flag applied to browser windows created by NewIE()
			(1 by default).  Call SetVisible() to change it on a live window.
		ie: the browser COM object returned by openIE().
		document: innerHTML of the page body as of the last WaitIE() call.
		text: innerText of the page body as of the last WaitIE() call.
		charset: document charset as of the last WaitIE() call.
	"""

	def __init__(self,url=None):
		self.url = url
		# Must be assigned before openIE(): NewIE() reads it.
		self.Visible = 1
		self.ie = self.openIE(url)
		self.document = ""
		self.text = ""
		self.charset = None

	def ExistIE(self,url):
		"""Return the open shell window whose LocationURL equals ``url``, else None."""
		ShellWindowsCLSID = '{9BA05972-F6A8-11CF-A442-00A0C90A8F39}'
		ies = DispatchEx(ShellWindowsCLSID)
		if len(ies)==0:
			return None
		for ie in ies:
			if ie.LocationURL==url:
				return ie
		return None

	def NewIE(self,url):
		"""Start a new Internet Explorer instance, navigate it to ``url`` and return it."""
		ie = DispatchEx("InternetExplorer.Application")
		ie.Visible = self.Visible
		ie.Navigate(url)
		return ie

	def openIE(self,url):
		"""Return a window already showing ``url``, or a freshly created one."""
		ie = self.ExistIE(url)
		# Identity test: '==' on a COM wrapper would go through its own
		# comparison logic instead of simply checking for "nothing found".
		if ie is None:
			ie = self.NewIE(url)
		return ie

	def WaitIE(self,timeout=None,interval=1):
		"""Block until the browser reports ReadyState 4, then cache the page.

		On return ``charset``, ``document`` (body innerHTML) and ``text``
		(body innerText) hold the values read from the loaded document.

		Args:
			timeout: maximum number of seconds to spend sleeping between
				polls; None (the default) waits indefinitely.
			interval: seconds to sleep between two ReadyState polls.

		Raises:
			RuntimeError: ``timeout`` was given and ReadyState had still not
				reached 4 once that much time had been slept.
		"""
		waited = 0
		while self.ie.ReadyState != 4:
			if timeout is not None and waited >= timeout:
				raise RuntimeError("page not loaded after %s seconds" % waited)
			sleep(interval)
			waited += interval
		doc = self.ie.Document
		self.charset = doc.charset
		self.document = doc.body.innerHTML
		self.text = doc.body.innerText

	def SetVisible(self,visible=None):
		"""Apply the visibility flag to the browser window.

		Args:
			visible: new value for ``self.Visible``; None keeps the current
				value and simply re-applies it to the window.
		"""
		if visible is not None:
			self.Visible = visible
		self.ie.Visible = self.Visible

	def Visible(self):
		"""Re-apply ``self.Visible`` to the browser window.

		Kept for backward compatibility only: __init__ stores the visibility
		flag in an instance attribute of the same name, which hides this
		method on instances, so it is reachable only as
		``COM_IE.Visible(instance)``.  Use SetVisible() instead.
		"""
		self.SetVisible()

	def GetBody(self):
		"""Wait for the page to finish loading and return the document body."""
		self.WaitIE()
		return self.ie.Document.body

	def GetNodes(self,parentNode,tag):
		"""Return a list of all ``tag`` elements found under ``parentNode``.

		Example:
			coldiv = ie.GetNodes(body, "div")
		"""
		return list(parentNode.getElementsByTagName(tag))

	def NodeByAttr(self,Nodes,nodeattr,nodeval):
		"""Return the first node in ``Nodes`` whose attribute equals ``nodeval``.

		The attribute value is converted with str() before the comparison.
		Returns None when no node matches.

		Example:
			div_id_editor = ie.NodeByAttr(coldiv, "id", "editor_ifr")
		"""
		for node in Nodes:
			if str(node.getAttribute(nodeattr))==nodeval:
				return node
		return None

	def _FindNode(self,body,node_type,node_attr,node_attr_val):
		"""Return the first ``node_type`` element under ``body`` matching the attribute.

		Raises:
			AttributeError: no element matches.  The type is the one the
				callers of the public helpers already got when the lookup
				came back empty; the message now names the missing node.
		"""
		tags = self.GetNodes(body,node_type)
		node = self.NodeByAttr(tags,node_attr,node_attr_val)
		if node is None:
			raise AttributeError("no <%s> node with %s=%r" % (node_type,node_attr,node_attr_val))
		return node

	def SetNodeHtml(self,body,node_type,node_attr,node_attr_val,node_inner_html):
		"""Set innerHTML of the first matching node; AttributeError if none matches."""
		self._FindNode(body,node_type,node_attr,node_attr_val).innerHTML = node_inner_html

	def SetNodeVal(self,body,node_type,node_attr,node_attr_val,node_value):
		"""Set ``value`` of the first matching node; AttributeError if none matches."""
		self._FindNode(body,node_type,node_attr,node_attr_val).value = node_value

	def NodeClick(self,body,node_type,node_attr,node_attr_val):
		"""Call click() on the first matching node; AttributeError if none matches."""
		self._FindNode(body,node_type,node_attr,node_attr_val).click()

	def GetNodeHtml(self,body,node_type,node_attr,node_attr_val):
		"""Return innerHTML of the first matching node; AttributeError if none matches."""
		return self._FindNode(body,node_type,node_attr,node_attr_val).innerHTML

	def GetNodeVal(self,body,node_type,node_attr,node_attr_val):
		"""Return ``value`` of the first matching node; AttributeError if none matches."""
		return self._FindNode(body,node_type,node_attr,node_attr_val).value

	# multiple nodes
	def NodesByAttr(self,Nodes,nodeattr=None,nodeval=None):
		"""Collect attribute values from every node in ``Nodes``.

		Args:
			Nodes: iterable of nodes exposing getAttribute().
			nodeattr: attribute to read.  When empty/None, a fixed set of
				common attributes is read from every node.
			nodeval: when given (and non-empty), only nodes whose
				str(attribute) equals it are included.

		Returns:
			A list with one dict per included node, mapping attribute name
			to the value returned by getAttribute().

		Example:
			hrefs = ie.NodesByAttr(links, "href")
		"""
		default_attrs = ["id","nodeName","nodeType","nodeValue","className",
		"innerHTML","innerText","href","name","title","type","value"]
		value_list = []
		for node in Nodes:
			if not nodeattr:
				value_list.append(dict((attr,node.getAttribute(attr)) for attr in default_attrs))
			elif not nodeval or str(node.getAttribute(nodeattr))==nodeval:
				value_list.append({nodeattr: node.getAttribute(nodeattr)})
		return value_list

	# multiple nodes
	def GetNodesVal(self,body,node_type,node_attr=None,node_val=None):
		"""Return NodesByAttr() results for every ``node_type`` element under ``body``."""
		tags = self.GetNodes(body,node_type)
		return self.NodesByAttr(tags,node_attr,node_val)

	def Quit(self):
		"""Close the browser by calling Quit() on the COM object."""
		self.ie.Quit()


if __name__=="__main__":

	url = "http://blog.csdn.net/agoago_2009/"
	IE = COM_IE(url)
	BODY = IE.GetBody()
	
	# a_list = IE.GetNodesVal(BODY,"a","href")
	a_list = IE.GetNodesVal(BODY,"a")
	for a in a_list:
		print a.get("innerText"),a.get("href")
	
	
	
	'''
	IE.SetNodeVal(BODY,"input","id","inputSearch","COM")
	IE.NodeClick(BODY,"input","id","btnSubmit")
	
	IE.WaitIE()
	print IE.document.strip()[:100]
	print IE.charset
	print IE.text.strip()[:100]
	'''
	
	raw_input('quit')
	IE.Quit()		
	
	
	

# Original article: https://www.cnblogs.com/zsychanpin/p/6936240.html