Scraping detailed information for English baby names

CSV storage uses the html_save(s) function.

Image storage uses the pic_save(url, name) function.
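Since pic_save as written below buffers the whole response in memory, a streamed variant may be safer for larger images. A minimal sketch (the function name, timeout, and save directory are illustrative assumptions, not part of the original script):

import requests

def pic_save_streamed(url, name, root='C://Users//L//Desktop//ba//'):
    # Illustrative variant: stream the image to disk in chunks
    path = root + name + '.jpg'
    r = requests.get(url, stream=True, timeout=10)
    r.raise_for_status()  # fail loudly instead of saving an error page as a .jpg
    with open(path, 'wb') as f:
        for chunk in r.iter_content(chunk_size=8192):
            f.write(chunk)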

The scraper first collects the links to every name's detail page from the index page and stores them in a list; it then fetches each link in turn and calls the storage functions to save the extracted data.

import sys
import io
import requests
from bs4 import BeautifulSoup

# Re-wrap stdout so the Windows console can print the scraped text
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')

def html_save(s):
    # Append one line of scraped data to Name.csv
    with open('Name.csv', 'a', encoding='gb18030') as f:
        f.write(s + '\n')

def pic_save(url, name):
    # Download the name's image and save it as <name>.jpg
    root = 'C://Users//L//Desktop//ba//'
    path = root + name + '.jpg'
    r = requests.get(url)
    with open(path, 'wb') as f:
        f.write(r.content)

def getName_link():
    # Collect the link to every name's detail page from the index page
    lst = []
    url = 'http://www.babynology.com/baby-boy-names.html'
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    for div in soup.find_all('div', {'class': 'babynology_textevidence babynology_bg_grey babynology_shadow babynology_radius left overflow_scroll'}):
        for strong in div.find_all('strong'):
            link = strong.find_all('a')[0].get('href').replace('\n', '')
            lst.append(link)
    return lst

def hh(lst):
    # Visit each detail page, extract name / gender / numerology, and store them
    for url in lst:
        r = requests.get(url)
        soup = BeautifulSoup(r.text, 'html.parser')
        name = soup.find('h2', {'class': 'txtclrm name-head2'}).text.strip()
        print('Name:', name)
        gender = soup.find('h5', {'style': 'color:#000;'}).text.strip()
        print('Gender:', gender)
        # The numerology <h5> mixes the description with a <span> and an
        # inline <script>; drop those two tags and keep only the description.
        h5 = soup.find('h5', {'style': 'color:#000; text-align:justify;'})
        for tag in h5.find_all(['span', 'script']):
            tag.extract()
        numerology = ' '.join(h5.text.split())
        n = name + ' Numerology:' + numerology
        n = n.replace(',', ' ')  # commas would split the line into extra CSV columns
        print(n)
        pic = soup.find('img', {'style': 'margin-left:-10px; margin-top:-5px;'}).get('src')
        html_save('Name:' + name)
        html_save('Gender:' + gender)
        html_save(n)
        pic_save(pic, name)
        print('-' * 75)

hh(getName_link())
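Note that html_save appends plain text lines rather than true CSV rows, which is why hh has to blank out commas. A minimal sketch of row-based storage with the standard csv module (the function name and column order are assumptions for illustration):

import csv

def row_save(name, gender, numerology, path='Name.csv'):
    # One properly quoted row per name; embedded commas no longer need stripping
    with open(path, 'a', newline='', encoding='gb18030') as f:
        csv.writer(f).writerow([name, gender, numerology])

With row-based storage, the n.replace(',', ' ') workaround in hh could be dropped.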
Original article: https://www.cnblogs.com/huanghuangwei/p/12077452.html