Crawler Practice

#Douban movies (豆瓣电影) — regex crawler
import requests,re,csv
url = "https://movie.douban.com/top250"
headers={
    "User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:92.0) Gecko/20100101 Firefox/92.0"
}
resp = requests.get(url,headers=headers)
page_content = resp.text
#parse the data
obj = re.compile(r'<li>.*?<div class="item">.*?<span class="title">(?P<name>.*?)'
                 r'</span>.*?<p class="">.*?<br>(?P<year>.*?)&nbsp.*?<span '
                 r'class="rating_num" property="v:average">(?P<score>.*?)</span>.*?'
                 r'<span>(?P<num>.*?)人评价</span>',re.S)
#run the match
result = obj.finditer(page_content)
f = open("data.csv",mode="w")
csvwriter = csv.writer(f)
for it in result:
    # print(it.group("name"))
    # print(it.group("score"))
    # print(it.group("num"))
    # print(it.group("year").strip())  #strip() removes surrounding whitespace
    #use the named groups as a dict
    dic = it.groupdict()
    dic['year'] = dic['year'].strip()
    csvwriter.writerow(dic.values())
f.close()
print("over!")
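A small optional tweak, not in the original post: opening data.csv with newline="" and an explicit utf-8 encoding avoids blank rows on Windows, and a header row documents the column order (it matches the order of the named groups returned by groupdict()).
# optional variation of the CSV-writing step above (my addition)
with open("data.csv", mode="w", newline="", encoding="utf-8") as f:
    csvwriter = csv.writer(f)
    csvwriter.writerow(["name", "year", "score", "num"])  #header; same order as groupdict()
    # ...then write each dic.values() row exactly as in the loop above
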
#dytt8 (电影天堂) — regex crawler
import requests,re
url = "https://www.dytt8.net/"
headers={
    "User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:92.0) Gecko/20100101 Firefox/92.0"
}
resp = requests.get(url,headers=headers,verify=False)
resp.encoding = 'gb2312' #set the character set
#regexes: the recommended-films <ul>, the links inside it, and the movie name/download link
obj1 = re.compile(r"最新影片推荐.*?<ul>(?P<ul>.*?)</ul>",re.S)
obj2 = re.compile(r"<a href='(?P<href>.*?)'",re.S)
obj3 = re.compile(r'◎片  名 (?P<movie>.*?)<br />.*?<a target="_blank" href="(?P<download>.*?)">', re.S)
result1 = obj1.finditer(resp.text)
#collect the child-page links
child_hrefs = []
for it in result1:
    ul = it.group('ul')
    #extract the child-page links
    result2 = obj2.finditer(ul)
    for itt in result2:
        #child-page link
        url2 = url + itt.group('href').strip("/")
        child_hrefs.append(url2)
        #print(url2)
#extract the content of each child page
for href in child_hrefs:
    resp2 = requests.get(href, headers=headers, verify=False)
    resp2.encoding = 'gb2312'  #set the character set
    #print(resp2.text)
    result3 = obj3.search(resp2.text)
    print(result3.group("movie"))
    print(result3.group("download"))
    break
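Because verify=False is passed to requests.get above, urllib3 emits an InsecureRequestWarning for every request. A minimal sketch to silence that warning (urllib3 is installed as a dependency of requests):
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)  #quiet the verify=False warnings
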
#bs4 crawler
import requests,re,csv
from bs4 import BeautifulSoup
url = "http://www.bjtzh.gov.cn/bjtz/home/jrcj/index.shtml"
headers={
    "User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:92.0) Gecko/20100101 Firefox/92.0"
}
resp = requests.get(url,headers=headers)
resp.encoding = 'utf-8' #set the character set
#write the results to a CSV file
f = open("菜价.csv",mode="w")
csvwriter = csv.writer(f)
#parse the data
#1. hand the page source to BeautifulSoup, producing a bs object
page = BeautifulSoup(resp.text,"html.parser") #use the html parser
#2. search the bs object for the data
#find(tag, attribute=value)
#find_all(tag, attribute=value)
#first form:
#div = page.find("div",class_="m-r-main m-textLists")
#class is a Python keyword, so bs4 uses class_ to avoid a syntax error
#second form:
div = page.find("div",attrs={"class":"m-r-main m-textLists"})
#the attrs form sidesteps the class keyword entirely
#(a CSS-selector equivalent is sketched after this script)
#grab every data row (tr)
trs = div.find_all("tr")[1:]  #[1:] slices off the header row
for tr in trs: #one row of data
    tds = tr.find_all("td") #the td cells in this row
    name = tds[0].text
    class1 = tds[1].text
    high = tds[2].text
    avg = tds[3].text
    #print(name,class1,high,avg)
    csvwriter.writerow([name,class1,high,avg])
f.close()
print("over!")
#netbian (彼岸) wallpaper scraper/downloader. Create the img folder beforehand or adjust the script (a one-line alternative is sketched after this script)
import requests,re,csv,time
from bs4 import BeautifulSoup
url = "https://pic.netbian.com/4kmeinv/"
url1 = "https://pic.netbian.com"
headers={
    "User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:92.0) Gecko/20100101 Firefox/92.0"
}
resp = requests.get(url,headers=headers)
resp.encoding = 'gbk' #set the character set
#print(resp.text)
#parse the data
page = BeautifulSoup(resp.text,"html.parser") #use the html parser
#2. search the bs object for the data
#find(tag, attribute=value)
#find_all(tag, attribute=value)
div = page.find("div",class_="slist").find_all("a")
#print(div)
for a in div:
    href = url1+(a.get('href'))
    #print(href)
    #fetch the child page source
    resp2 = requests.get(href, headers=headers)
    resp2.encoding = 'gbk'  #set the character set
    page2 = BeautifulSoup(resp2.text, "html.parser")
    div2 = page2.find("div",class_="photo-pic")
    img = div2.find("img")
    src = url1+(img.get("src"))
    #print(src)
    #download the image
    img_resp = requests.get(src)
    #img_resp.content holds the raw bytes
    img_name = src.split("/")[-1]
    # take the part after the last "/", e.g. from https://pic.netbian.com/uploads/allimg/210831/102129-163037648996ad.jpg
    # this yields 102129-163037648996ad.jpg
    with open("img/"+img_name,mode="wb") as f:   #save into the img folder
        f.write(img_resp.content) #write the image bytes to the file
        #no explicit close needed: the with statement closes the file
    print(img_name + " downloaded OK")
    time.sleep(0.5)
print("OVER")
#thread pool + XPath extraction
import requests,csv
from lxml import etree
from concurrent.futures import ThreadPoolExecutor
headers={
    "User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:92.0) Gecko/20100101 Firefox/92.0"
}
f = open("1.csv",mode="w",encoding="UTF-8")
csvwriter = csv.writer(f)
def page1(url):
    resp = requests.get(url,headers=headers)
    #resp.encoding = "UTF-8"  # 指定字符集
    #print(resp.text)
    html = etree.HTML(resp.text)
    table = html.xpath("/html/body/div[4]/div[3]/div[3]/table[2]/tbody")[0]
    #print(table)
    trs = table.xpath("./tr")
    #iterate over the tr rows
    for tr in trs:
        txt = tr.xpath("./td/text()")
        #print(txt)
        #light cleanup of the data
        txt = (item.replace("\xa0","") for item in txt)
        #print(list(txt))
        #store the row
        csvwriter.writerow(txt)
    print(url+" done")
if __name__ == '__main__':
    #page1("http://www.maicainan.com/offer/show/classid/14/id/4652.html")
    #create the thread pool
    with ThreadPoolExecutor(50) as t:     #50 worker threads
        for i in range(11,99):              #88 tasks (i = 11..98)
            #submit each task to the pool
            t.submit(page1,f"http://www.maicainan.com/offer/show/classid/14/id/46{i}.html")
    print("all pages extracted")
Original post: https://www.cnblogs.com/bingtang123/p/15374364.html