[urllib] Biquge (笔趣阁): scraping the complete Douluo Dalu, plus San Cun Tian Tang

# coding=gbk  # the script runs in the Windows console, so the source is saved as GBK
url='http://www.biquge.info/10_10218/'
UA={"User-Agent":"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"}
UA1={"User-Agent":"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    'Host':'www.xxbiquge.com',
    'Referer':'https://www.xxbiquge.com/2_2278/'}
import time,pymysql
from lxml import etree
from urllib.request import Request
from urllib.request import urlopen
import os,sys,io
sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding='gb18030')

def source(url):  # fetch the raw page source
    global UA
    text=urlopen(Request(url,None,UA),timeout=5)
    return text.read()

def respon(text):  # parse the chapter list page, return the chapter hrefs
    seletor=etree.HTML(text)
    url1=seletor.xpath("//*[@id='list']/dl/dd/a/@href")
    return url1

def spider(url):  # parse chapter content, e.g. spider('http://www.biquge.info/10_10218/5002106.html')
    global UA1
    for i in url:
        i='https://www.xxbiquge.com'+i
        a=urlopen(Request(i,None,UA1),timeout=5).read()
        seletor=etree.HTML(a)
        text=seletor.xpath('//*[@id="content"]/text()')  # chapter body
        c=''.join(text)
        text1=seletor.xpath('//html/head/title/text()')[0].split('-')[0]  # chapter title
        #print(i,type(i),text1,type(text1))
        mysqlw(c,i,text1)
        time.sleep(3)


#c=os.path.join(os.path.abspath(os.path.dirname(__name__)),'2.html')
#with open(c,'r') as f:
#    a=f.read()

def mysqlw(text,url,chapter):  # write one chapter into MySQL
    b1=time.time()
    b=pymysql.connect(host='localhost',port=3306,user='root',passwd='liu',db='test',charset='utf8')
    cur=b.cursor()
    print(url,chapter,'w')
    # parameterised query, so quotes inside the chapter text cannot break the statement
    sql="insert into douludalu(souce,html,chapter) values(%s,%s,%s)"
    try:
        cur.execute(sql,(text,url,chapter))
        b.commit()
        print("insert OK")
    except Exception as e:
        print(e)
        b.rollback()
    b.close()
    print("closed, elapsed",time.time()-b1)

def mysqlr(text):  # read a chapter back by its URL
    b=pymysql.connect(host='localhost',port=3306,user='root',passwd='liu',db='test',charset='utf8')
    cur=b.cursor()
    cur.execute('select * from douludalu where html=%s',(text,))
    for i in cur.fetchall():
        print(i[0],i[3])
    b.close()

#a='2唐三已经挥出了八千余锤,铁坨不断的变小,已经不到最初时三分'
#mysqlw(a,'1.html','第一章')
def main():
    a=source('https://www.xxbiquge.com/2_2278/')
    b=respon(a)
    spider(b)
#mysqlr('https://www.xxbiquge.com/2_2278/1036550.html')
main()
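
The scripts assume the target table already exists in the test database. The post never shows the schema, so the sketch below is an assumption reconstructed from the INSERT columns (souce, html, chapter) and from mysqlr reading columns 0 and 3; the id column and the column types are my guesses:

# One-off setup sketch (assumed schema, not shown in the original post):
# an auto-increment id plus the three columns the scripts actually write.
import pymysql

conn = pymysql.connect(host='localhost', port=3306, user='root',
                       passwd='liu', db='test', charset='utf8')
try:
    with conn.cursor() as cur:
        cur.execute("""
            create table if not exists douludalu (
                id      int auto_increment primary key,
                souce   longtext,        -- chapter body (column name kept as in the post)
                html    varchar(255),    -- chapter URL
                chapter varchar(255)     -- chapter title
            ) default charset=utf8
        """)
    conn.commit()
finally:
    conn.close()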

 ——————————————————————————————————————————————————————————————————

San Cun Tian Tang (三寸天堂)

# coding=gbk
url='http://www.biquge.info/10_10218/'
UA={"User-Agent":"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"}
UA1={"User-Agent":"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    'Host':'www.biquge.com.tw',
    'Referer':'http://www.biquge.com.tw/14_14055/',
    'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'}
import time,pymysql,threading
from lxml import etree
from urllib.request import Request
from urllib.request import urlopen
import os,sys,io
sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding='gb18030')

def source(url):  # fetch the raw page source
    global UA
    text=urlopen(Request(url,None,UA),timeout=5)
    return text.read()

def respon(text):  # parse the chapter list page, return the chapter hrefs
    seletor=etree.HTML(text)
    url1=seletor.xpath("//*[@id='list']/dl/dd/a/@href")
    return url1

def spider(url):  # parse one chapter page and store it
    global UA1
    i='http://www.biquge.com.tw/'+url
    print(i)
    a=urlopen(Request(i,None,UA1),timeout=5).read()
    if not a:  # empty response: skip this chapter
        return
    seletor=etree.HTML(a)
    text=seletor.xpath('//*[@id="content"]/text()')  # chapter body
    c=''.join(text)
    text1=seletor.xpath('//html/head/title/text()')[0]  # chapter title
    print(text1)
    #print(i,type(i),text1,type(text1))
    mysqlw(c,i,text1)
    time.sleep(3)

def mysqlw(text,url,chapter):  # write one chapter into MySQL
    b1=time.time()
    b=pymysql.connect(host='localhost',port=3306,user='root',passwd='liu',db='test',charset='utf8')
    cur=b.cursor()
    print(url,chapter)
    # parameterised query, so quotes inside the chapter text cannot break the statement
    sql="insert into suibian(souce,html,chapter) values(%s,%s,%s)"
    try:
        cur.execute(sql,(text,url,chapter))
        b.commit()
        print("insert OK")
    except Exception as e:
        print(e)
        b.rollback()
    b.close()
    print("closed, elapsed",time.time()-b1)

def mysqlr(text):  # read a chapter back by its URL; returns True if a row was found
    found=False
    b=pymysql.connect(host='localhost',port=3306,user='root',passwd='liu',db='test',charset='utf8')
    cur=b.cursor()
    cur.execute('select * from douludalu where html=%s',(text,))
    for i in cur.fetchall():
        print(i[0],i[3])
        found=True
    b.close()
    return found

def main():
    print(threading.current_thread().name)
    cc=time.time()
    print('start time %s'%cc)
    a=source('http://www.biquge.com.tw/14_14055/')
    b=respon(a)
    for i in b:
        #print(i)
        spider(i)
    ctime=time.time()-cc
    print('finished in %s'%ctime)


#c=os.path.join(os.path.abspath(os.path.dirname(__name__)),'1.html')
#with open(c,'r') as f:
#    a=f.read()
main()

One thing to watch out for: passing the UA headers to Request the wrong way raises errors, and it takes some patience to track the problem down.
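For reference, a minimal sketch of how a headers dict is handed to urllib.request.Request (the URL below is just a placeholder). The usual mistake is passing the dict as the second positional argument, which Request treats as POST data:

from urllib.request import Request, urlopen

page_url = 'http://www.biquge.com.tw/14_14055/'   # placeholder URL
headers = {'User-Agent': 'Mozilla/5.0',
           'Referer': 'http://www.biquge.com.tw/14_14055/'}

req = Request(page_url, None, headers)       # positional form used in the scripts: url, data, headers
req = Request(page_url, headers=headers)     # keyword form, harder to get wrong (data stays None)

html = urlopen(req, timeout=5).read()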

Common mistakes:

  1. Request headers: a wrong Referer or a wrong Host.

  2. A wrong XPath for the page; from a quick look, this site's pages are fairly regular. (A quick way to check an XPath offline is sketched below.)
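
A minimal sketch for checking the XPath expressions against a locally saved chapter page before running the full spider; chapter.html is a placeholder file name, and the expressions are the ones used in the scripts above:

from lxml import etree

# chapter.html is a placeholder: save one chapter page from the site to this file first
with open('chapter.html', 'rb') as f:
    tree = etree.HTML(f.read())

body  = tree.xpath('//*[@id="content"]/text()')   # chapter body lines
title = tree.xpath('//html/head/title/text()')    # page title

print(title[:1])   # should show the chapter title
print(len(body))   # 0 means the XPath does not match this page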

Not all success comes from just sitting back and waiting.
Original post: https://www.cnblogs.com/Skyda/p/9179420.html