#coding:utf-8
# Monkey-patched String helpers for extracting substrings between
# "tag" markers (plain Strings or Regexps) in scraped HTML.
class ::String
  # Locate the +count+-th occurrence of +tag+ (String or Regexp),
  # searching from the left.
  #
  # Returns an inclusive Range covering the matched characters, or nil
  # when fewer than +count+ occurrences exist.
  def find_tag_idx(tag, count = 1)
    html = self
    current_count = 0
    idx = 0
    start = 0
    while current_count < count
      idx = html.index(tag, start)
      break if idx.nil?
      current_count += 1
      if tag.class == String
        start = idx + tag.length
      else
        # Regexp matches vary in length, so measure the actual matched
        # text (the first match within the unscanned remainder).
        start = idx + html[start..-1].slice(tag).length
      end
    end
    idx.nil? ? nil : (idx..start - 1)
  end

  # Locate the +count+-th occurrence of +tag+, counting from the right.
  #
  # Returns an inclusive Range covering the matched characters, or nil
  # when fewer than +count+ occurrences exist (or the match is empty).
  def rfind_tag_idx(tag, count = 1)
    html = self
    current_count = 0
    idx = 0
    start = html.length
    l = -1
    while current_count < count
      # BUG FIX: a negative search position makes String#rindex count
      # from the END of the string, which re-counted the occurrence at
      # index 0. Treat it as "no more matches to the left".
      if start < 0
        idx = nil
        break
      end
      idx = html.rindex(tag, start)
      break if idx.nil?
      current_count += 1
      start = idx - 1
      if tag.class == String
        l = tag.length
      else
        # For a Regexp, take the length of the matching text: scan lists
        # matches left-to-right, so reverse and pick the count-th.
        m = html.scan(tag).reverse[current_count - 1]
        l = m.nil? ? 0 : m.length
      end
    end
    (idx.nil? || l < 1) ? nil : (idx..idx + l - 1)
  end

  # Extract the substring between the :s and :e markers (exclusive of
  # both). With :left => false, :s still anchors from the left side but
  # both markers are located from the RIGHT: :e is found first, then :s
  # is searched within the text before it.
  #
  # args keys: :s / :e (String or Regexp marker, nil to skip),
  # :s_count / :e_count (which occurrence), :left (search direction).
  # Returns "" when a requested marker cannot be found.
  def get_part2(args = { :s => nil, :e => nil, :s_count => 1, :e_count => 1, :left => true })
    _html = self.to_s
    return _html if args.length == 0
    if args[:left]
      e = -1
      unless args[:s].nil?
        s_range = _html.find_tag_idx(args[:s], args[:s_count])
        return "" if s_range.nil?
        # Keep only the text after the start marker.
        _html = _html[(s_range.last + 1)..-1]
      end
      unless args[:e].nil?
        e_range = _html.find_tag_idx(args[:e], args[:e_count])
        # An end marker at position 0 means there is nothing between.
        return "" if e_range.nil? || e_range == (0..0)
        e = e_range.first - 1
      end
      _html[0..e]
    else
      s = 0
      unless args[:e].nil?
        e_range = _html.rfind_tag_idx(args[:e], args[:e_count])
        return "" if e_range.nil?
        # Keep only the text before the end marker.
        _html = _html[0..(e_range.first - 1)]
      end
      unless args[:s].nil?
        s_range = _html.rfind_tag_idx(args[:s], args[:s_count])
        return "" if s_range.nil?
        s = s_range.last + 1
      end
      _html[s..-1]
    end
  end
end
#html="abcd_abcd"
#s="a"
#e="d"
#r=html.find_tag_idx(/bc/,count=1)
#puts r
#if r.nil?
# puts "为nil"
#else
# puts html[r]
#end
#html="abcd_abcd"
#s="a"
#e="d"
#puts html.get_part2(:s=>"c",:e=>"d",:s_count=>1,:e_count=>2,:left=>true)
#html="abcd_abceddd"
#s="a"
#e="d"
##r=html.rfind_tag_idx(/b\w?c/,count=1)
#r=html.rfind_tag_idx("ed",count=1)
#puts r
#if r.nil?
# puts "为nil"
#else
# puts html[r]
#end
# Demo: with :left => true and no :s marker, returns everything before
# the 2nd /d/ match ("abcd_abc").
html="abcd_abcdd"
puts html.get_part2(:e=>/d/,:s_count=>2,:e_count=>2,:left=>true)
# Return the substring of +html+ between marker :s and marker :e
# (exclusive of both). Markers may be Strings or Regexps.
# Returns "" when +html+ is nil or a requested marker cannot be found.
def get_part(html, args = { :s => nil, :e => nil })
  return "" if html.nil?
  _html = html.to_s
  return _html if args.length == 0
  s = 0
  e = -1
  unless args[:s].nil?
    s_idx = _html.index(args[:s])
    return "" if s_idx.nil?
    s = if args[:s].class == String
          s_idx + args[:s].length
        else
          # Regexp marker: skip over the actual matched text.
          s_idx + _html.slice(args[:s]).length
        end
  end
  unless args[:e].nil?
    e_idx = _html.index(args[:e], s + 1)
    return "" if e_idx.nil?
    e = e_idx - 1
  end
  _html[s..e]
end
#html="abcd_abcd"
#s="a"
#e="d"
#puts get_part(html,:s=>/c/,:e=>"d")
#a="中国a"
#puts a.size
#puts a.length
#a="abc"
#puts a.size
#puts html.scan("ab")
#puts $&
# --- Attached below: Python version of the same extraction helpers ---
#!/usr/bin/env python
#encoding=utf-8
"""
2010.10.17,v0.32
2010.10.16,v0.30
2010.10.15,v0.29
2010.10.14,v0.27
2010.10.13,v0.26
2010.10.12,v0.25
2010.10.10,v0.23
2010.10.10,v0.22
2010.10.09,v0.2
2010.10.07,v0.1
批量抽取详细页数据
"""
import sys
#print sys.getdefaultencoding()
####reload(sys)#生成代码时,####的会自动去除
#sys.setdefaultencoding(sys.stdout.encoding)
####sys.setdefaultencoding('utf-8')
import re
import time
import urllib2
import os.path
from urlparse import urljoin
from pyquery import PyQuery as pq
from xml.dom import minidom,Node
import types
# --- Module-level configuration -------------------------------------
# ${...} placeholders are substituted by the code generator before the
# script runs.
g_host = "${host}"
g_details_folder = os.path.join("./","details")
g_xmls_folder = os.path.join("./","xmls")
g_xmls_infos_folder = os.path.join("./","xmls_infos")
g_success_file = os.path.join("./","xmls_infos/success.txt")
g_error_file = os.path.join("./","xmls_infos/error.txt")
g_extract_links_file = os.path.join("./","details_infos/success.txt")

# Parse the pasted "Name: value" HTTP header lines into g_headers.
g_headers = {}
headers = """${headers}"""
headers = headers.strip().replace("\r\n", "\n")
# BUG FIX (portability): the `<>` operator was removed in Python 3;
# `!=` behaves identically in Python 2.
if headers != "":
    for elem in headers.split("\n"):
        if elem.strip() == "":
            continue
        a, b = elem.split(":", 1)
        g_headers[a.strip()] = b.strip()

# Extracted field values for the current page, keyed by output field
# name. NOTE(review): shadows the builtin `dict`; gen_xml relies on it.
dict = {}
# Utility functions
def init():
    """Create the output/cache folders (xmls, xmls_infos, details) if missing."""
    # BUG FIX (portability): Python-2-only print statements converted to
    # the parenthesized single-argument form, identical in Python 2.
    print("初始数据")
    if not os.path.exists(g_xmls_folder):
        os.makedirs(g_xmls_folder)
    if not os.path.exists(g_xmls_infos_folder):
        os.makedirs(g_xmls_infos_folder)
    if not os.path.exists(g_details_folder):
        os.makedirs(g_details_folder)
def delete(src):
    """Recursively delete a file or a directory tree, ignoring any
    OS errors (missing files, permissions) along the way."""
    if os.path.isfile(src):
        try:
            os.remove(src)
        except:
            pass
    elif os.path.isdir(src):
        # Empty the directory first, then remove the directory itself.
        for entry in os.listdir(src):
            delete(os.path.join(src, entry))
        try:
            os.rmdir(src)
        except:
            pass
def clear():
    """Remove all previously generated xml output and bookkeeping files."""
    # BUG FIX (portability): py2-only print statement -> parenthesized
    # form (identical behavior in Python 2).
    print("清除以前数据")
    delete(g_xmls_folder)
    delete(g_xmls_infos_folder)
def size(src):
    """Print the total size of a file or directory tree in
    human-readable units (GiB/MiB/KiB, or raw bytes below 1000)."""
    r = 0
    if os.path.isfile(src):
        r = os.path.getsize(src)
    else:
        for root, dirs, files in os.walk(src):
            r += sum([os.path.getsize(os.path.join(root, name)) for name in files])
    l = len(str(r))
    # BUG FIX: use float division; integer division truncated the value
    # before "%.2f" formatting (e.g. ~5 MB printed as "4.00 MiB").
    if l > 9:
        r = "%.2f GiB" % (r / 1024.0 / 1024.0 / 1024.0)
    elif l > 6:
        r = "%.2f MiB" % (r / 1024.0 / 1024.0)
    elif l > 3:
        r = "%.2f KiB" % (r / 1024.0)
    print("%s 大小为:%s" % (src, r))
def error(url,ex):
    # Append the failing URL to the error log file.
    # NOTE(review): the `ex` argument is accepted but never written, so
    # the exception detail is lost. Confirm whether it should be logged
    # (and whether downstream tooling expects one bare URL per line
    # before changing the format).
    f=open(g_error_file,"a")
    f.write("%s\n"%(url,))
    f.close()
def success(url):
    """Append a successfully processed URL (one per line) to the
    success bookkeeping file."""
    log = open(g_success_file, "a")
    log.write("%s\n" % url)
    log.close()
def statistics(func):
    """Decorator: print total/successed/left unique line counts (from
    the extracted-links and success bookkeeping files) before and after
    each call of `func`."""
    # BUG FIX (portability): py2-only print statements -> parenthesized
    # form (identical behavior in Python 2).
    def tongji():
        # Duplicate lines in the logs are collapsed via set().
        total, successed = 0, 0
        if os.path.exists(g_extract_links_file):
            total = len(set(open(g_extract_links_file, "r").readlines()))
            print("total lines:%s" % total)
        if os.path.exists(g_success_file):
            successed = len(set(open(g_success_file, "r").readlines()))
            print("successed lines:%s" % successed)
        print("left lines:%s" % (total - successed))
    def newFunc(*args, **args2):
        tongji()
        back = func(*args, **args2)
        tongji()
        return back
    return newFunc
def cost_time(func):
    """Decorator: print wall-clock start/end timestamps and the elapsed
    seconds for each call of `func`; the wrapped return value is passed
    through unchanged."""
    # BUG FIX (portability): py2-only print statements -> parenthesized
    # form (identical behavior in Python 2).
    def newFunc(*args, **args2):
        t0 = time.time()
        print("@%s, {%s} start" % (time.strftime("%X", time.localtime()), func.__name__))
        back = func(*args, **args2)
        print("@%s, {%s} end" % (time.strftime("%X", time.localtime()), func.__name__))
        print("@%.3fs taken for {%s}" % (time.time() - t0, func.__name__))
        return back
    return newFunc
def url2filename(url):
    """Map a URL to a filesystem-safe cache filename (urlsafe base64)."""
    import base64
    encoded = base64.urlsafe_b64encode(url)
    return encoded
def url2filename2(url):
    """Derive a filename from the last path segment of `url`; when the
    URL ends with "/" (or has no "/" at all with an empty tail), fall
    back to the last run of digits anywhere in the URL.

    Raises IndexError if neither strategy yields a name."""
    # BUG FIX (portability): py2-only print statement -> parenthesized
    # form; also dropped the dead commented-out raise.
    url = url.strip()
    idx = url.rfind("/")
    r = url[idx + 1:]
    if idx == -1 or len(r) == 0:
        print("启用特殊url2filename")
        r = re.findall(r"\d+", url)[-1]
    return r
def get_html(url):
    """Return the HTML for `url`, caching the raw response on disk under
    g_details_folder (cache filename = urlsafe base64 of the URL).
    Fetches over HTTP with g_headers only on a cache miss."""
    # BUG FIX (portability): py2-only print statements -> parenthesized
    # form; renamed local `file` (shadowed the builtin).
    init()
    html = ""
    cache_file = os.path.join(g_details_folder, url2filename(url))
    print(cache_file)
    if not os.path.exists(cache_file):
        print(url)
        req = urllib2.Request(url=url, headers=g_headers)
        html = urllib2.urlopen(req).read()
        open(cache_file, "w").write(html)
        print("从网络抓取")
    else:
        print("直接利用本地")
        html = open(cache_file, "r").read()
    return html
def get_part(html, start, end):
    """Return the text strictly between the first occurrence of `start`
    and the next occurrence of `end`; "" when either marker is absent."""
    s = html.find(start)
    e = html.find(end, s + len(start))
    if s == -1 or e == -1:
        return ""
    return html[s + len(start):e]
def find_tag_idx(html, tag, count=1, use_re=False):
    """
    Find the span of the `count`-th occurrence of `tag` in `html`,
    searching from the left.

    Returns a (start, end) tuple usable as html[start:end]; (-1, 0)
    when there are fewer than `count` occurrences.
    With use_re=True, `tag` is treated as a regular expression.
    """
    r = (-1, 0)
    current_count = 0
    if not use_re:
        idx = -1
        start = 0
        while current_count < count:
            idx = html.find(tag, start)
            if idx > -1:
                current_count += 1
                start = idx + len(tag)
            else:
                break
        # BUG FIX: only build the span when the occurrence was actually
        # found; previously a miss returned (-1, len(tag)-1) instead of
        # the documented (-1, 0).
        if idx > -1 and current_count == count:
            r = (idx, idx + len(tag))
    else:
        for match in re.finditer(tag, html):
            current_count += 1
            if current_count == count:
                r = match.span()
                break
    return r
def rfind_tag_idx(html, tag, count=1, use_re=False):
    """
    Find the span of the `count`-th occurrence of `tag` in `html`,
    counting from the RIGHT.

    Returns a (start, end) tuple usable as html[start:end]; (-1, 0)
    when there are fewer than `count` occurrences.
    With use_re=True, `tag` is treated as a regular expression.
    """
    r = (-1, 0)
    current_count = 0
    if not use_re:
        idx = -1
        end = len(html)
        while current_count < count:
            # Note: str.rfind's `end` bound is exclusive, so shrinking
            # it to `idx` skips the occurrence just found.
            idx = html.rfind(tag, 0, end)
            if idx > -1:
                current_count += 1
                end = idx
            else:
                break
        # BUG FIX: only build the span on an actual hit; a miss used to
        # return (-1, len(tag)-1) instead of the documented (-1, 0).
        if idx > -1 and current_count == count:
            r = (idx, idx + len(tag))
    else:
        # Enumerate left-to-right matches, then walk them from the right.
        for match in reversed(list(re.finditer(tag, html))):
            current_count += 1
            if current_count == count:
                r = match.span()
                break
    return r
def get_part2(html, start, end, start_count=1, end_count=1, start_re=False, end_re=False, reverseDirection=False):
    """
    Return the text strictly between the `start` and `end` markers
    (neither marker's own characters are included); "" on any miss.

        get_part2("abcabc", "a", "c", start_count=2)  ->  "b"

    With reverseDirection=True both markers are located from the right:
    `end` is found first, then `start` is searched in the text before it.
    """
    if reverseDirection:
        e_from, e_to = rfind_tag_idx(html, end, end_count, use_re=end_re)
        if e_from == -1:
            return ""
        head = html[:e_from]
        s_from, s_to = rfind_tag_idx(head, start, start_count, use_re=start_re)
        if s_from == -1:
            return ""
        return head[s_to:]
    s_from, s_to = find_tag_idx(html, start, start_count, use_re=start_re)
    if s_from == -1:
        return ""
    tail = html[s_to:]
    e_from, e_to = find_tag_idx(tail, end, end_count, use_re=end_re)
    if e_from == -1:
        return ""
    return tail[:e_from]
def filter_tags(html,tags=["em","dd","input","h1","h2","h3","br","a","b","span","strong","p","hr","strong","p","hr","font","div","td","tr","img","form","table"]):
    """Strip the opening and closing forms of each listed HTML tag
    (case-insensitive; attributes and multi-line tags are handled)."""
    result = html
    for name in tags:
        # Opening tags: <name ...> ; closing tags may have spaces: </ name>
        result = re.sub(r"(?i)<%s[\s\S]*?>" % name, "", result)
        result = re.sub(r"(?i)</ *%s[\s\S]*?>" % name, "", result)
    return result
def filter_comment(html):
    """Remove all HTML comments (<!-- ... -->), including multi-line ones."""
    return re.sub(r"<!--[\s\S]*?-->", '', html)
def filter_characters(html,tags=["¥"," ","]",":"]):
    """Delete every literal string in `tags` from `html` (defaults are
    stray full-width punctuation / currency marks from scraped pages)."""
    for needle in tags:
        html = html.replace(needle, "")
    return html
def filter_int(html):
    """Extract the digits from `html` and return them as a normalized
    integer string; "0" when no usable number remains."""
    digits = re.sub(r"(?m)[^\d]+", '', html).strip()
    # BUG FIX: catch only ValueError; the bare `except` also swallowed
    # KeyboardInterrupt/SystemExit.
    try:
        return str(int(digits))
    except ValueError:
        return "0"
def filter_price(html):
    """Extract a decimal number from `html` and return it normalized via
    float(); "0" when nothing parseable remains (e.g. "", "1.2.3")."""
    number = re.sub(r"(?m)[^\d\.]*", '', html).strip()
    # BUG FIX: catch only ValueError; the bare `except` also swallowed
    # KeyboardInterrupt/SystemExit.
    try:
        return str(float(number))
    except ValueError:
        return "0"
def _(u):
    # Coerce a byte string to unicode; already-unicode input is returned
    # unchanged. Byte strings are assumed to be UTF-8 encoded.
    # NOTE(review): Python 2 only — `unicode` does not exist in Python 3.
    if not isinstance(u,unicode):
        return unicode(u,"utf8")
    return u
def gen_xml(url):
    # Serialize the module-level `dict` of extracted fields into a
    # Solr-style <add><doc><field name=...>value</field>...</doc></add>
    # XML file named after the (base64-encoded) URL.
    # NOTE(review): mutates the global `dict` (deletes empty fields) and
    # uses Python-2-only constructs (types.StringType, iteritems,
    # print statement).
    xml_filename=os.path.join(g_xmls_folder,url2filename(url)+".xml")
    xml=minidom.Document()
    add=xml.createElement("add")
    xml.appendChild(add)
    doc=xml.createElement("doc")
    add.appendChild(doc)
    def c(na,va):
        "create field node"
        field=xml.createElement("field")
        field.setAttribute("name",na)
        field.appendChild(xml.createTextNode(va))
        doc.appendChild(field)
    # Drop None and blank-string fields before serializing.
    for k in dict.keys():
        if dict[k] is None or ((type(dict[k])==types.StringType or type(dict[k])==types.UnicodeType) and dict[k].strip()==""):
            del dict[k]
    for k,v in dict.iteritems():
        c(k,str(v))# numeric fields (e.g. stock) are also emitted as strings
    import codecs
    f=codecs.open(xml_filename,"w")
    f.write(codecs.BOM_UTF8)# write a BOM so downstream tools detect UTF-8
    f.write(xml.toxml("utf-8"))
    f.close()
    print "生成文件%s"%xml_filename