Crawler Practice

#Douban movies (豆瓣电影) — regex crawler
import requests,re,csv
url = "https://movie.douban.com/top250"
headers={
    "User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:92.0) Gecko/20100101 Firefox/92.0"
}
resp = requests.get(url,headers=headers)
page_content = resp.text
#parse the data
obj = re.compile(r'<li>.*?<div class="item">.*?<span class="title">(?P<name>.*?)'
                 r'</span>.*?<p class="">.*?<br>(?P<year>.*?)&nbsp.*?<span '
                 r'class="rating_num" property="v:average">(?P<score>.*?)</span>.*?'
                 r'<span>(?P<num>.*?)人评价</span>',re.S)
#run the match
result = obj.finditer(page_content)
f = open("data.csv",mode="w")
csvwriter = csv.writer(f)
for it in result:
    # print(it.group("name"))
    # print(it.group("score"))
    # print(it.group("num"))
    # print(it.group("year").strip())  #strip() removes surrounding whitespace
    #use the named groups as a dict
    dic = it.groupdict()
    dic['year'] = dic['year'].strip()
    csvwriter.writerow(dic.values())
f.close()
print("over!")
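A small optional tweak, not in the original post: opening data.csv with newline="" and an explicit utf-8 encoding avoids blank rows on Windows, and a header row documents the column order (it matches the order of the named groups returned by groupdict()).
# optional variation of the CSV-writing step above (my addition)
with open("data.csv", mode="w", newline="", encoding="utf-8") as f:
    csvwriter = csv.writer(f)
    csvwriter.writerow(["name", "year", "score", "num"])  #header; same order as groupdict()
    # ...then write each dic.values() row exactly as in the loop above
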
#dytt8 (电影天堂) — regex crawler
import requests,re
url = "https://www.dytt8.net/"
headers={
    "User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:92.0) Gecko/20100101 Firefox/92.0"
}
resp = requests.get(url,headers=headers,verify=False)
resp.encoding = 'gb2312' #set the character set
#regexes: the recommended-films <ul>, the links inside it, and the movie name/download link
obj1 = re.compile(r"最新影片推荐.*?<ul>(?P<ul>.*?)</ul>",re.S)
obj2 = re.compile(r"<a href='(?P<href>.*?)'",re.S)
obj3 = re.compile(r'◎片  名 (?P<movie>.*?)<br />.*?<a target="_blank" href="(?P<download>.*?)">', re.S)
result1 = obj1.finditer(resp.text)
#collect the child-page links
child_hrefs = []
for it in result1:
    ul = it.group('ul')
    #extract the child-page links
    result2 = obj2.finditer(ul)
    for itt in result2:
        #child-page link
        url2 = url + itt.group('href').strip("/")
        child_hrefs.append(url2)
        #print(url2)
#extract the content of each child page
for href in child_hrefs:
    resp2 = requests.get(href, headers=headers, verify=False)
    resp2.encoding = 'gb2312'  #set the character set
    #print(resp2.text)
    result3 = obj3.search(resp2.text)
    print(result3.group("movie"))
    print(result3.group("download"))
    break
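Because verify=False is passed to requests.get above, urllib3 emits an InsecureRequestWarning for every request. A minimal sketch to silence that warning (urllib3 is installed as a dependency of requests):
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)  #quiet the verify=False warnings
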
#bs4 crawler
import requests,re,csv
from bs4 import BeautifulSoup
url = "http://www.bjtzh.gov.cn/bjtz/home/jrcj/index.shtml"
headers={
    "User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:92.0) Gecko/20100101 Firefox/92.0"
}
resp = requests.get(url,headers=headers)
resp.encoding = 'utf-8' #set the character set
#write the results to a CSV file
f = open("菜价.csv",mode="w")
csvwriter = csv.writer(f)
#parse the data
#1. hand the page source to BeautifulSoup, producing a bs object
page = BeautifulSoup(resp.text,"html.parser") #use the html parser
#2. search the bs object for the data
#find(tag, attribute=value)
#find_all(tag, attribute=value)
#first form:
#div = page.find("div",class_="m-r-main m-textLists")
#class is a Python keyword, so bs4 uses class_ to avoid a syntax error
#second form:
div = page.find("div",attrs={"class":"m-r-main m-textLists"})
#the attrs form sidesteps the class keyword entirely
#(a CSS-selector equivalent is sketched after this script)
#grab every data row (tr)
trs = div.find_all("tr")[1:]  #[1:] slices off the header row
for tr in trs: #one row of data
    tds = tr.find_all("td") #the td cells in this row
    name = tds[0].text
    class1 = tds[1].text
    high = tds[2].text
    avg = tds[3].text
    #print(name,class1,high,avg)
    csvwriter.writerow([name,class1,high,avg])
f.close()
print("over!")
#netbian (彼岸) wallpaper scraper/downloader. Create the img folder beforehand or adjust the script (a one-line alternative is sketched after this script)
import requests,re,csv,time
from bs4 import BeautifulSoup
url = "https://pic.netbian.com/4kmeinv/"
url1 = "https://pic.netbian.com"
headers={
    "User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:92.0) Gecko/20100101 Firefox/92.0"
}
resp = requests.get(url,headers=headers)
resp.encoding = 'gbk' #set the character set
#print(resp.text)
#parse the data
page = BeautifulSoup(resp.text,"html.parser") #use the html parser
#2. search the bs object for the data
#find(tag, attribute=value)
#find_all(tag, attribute=value)
div = page.find("div",class_="slist").find_all("a")
#print(div)
for a in div:
    href = url1+(a.get('href'))
    #print(href)
    #fetch the child page source
    resp2 = requests.get(href, headers=headers)
    resp2.encoding = 'gbk'  #set the character set
    page2 = BeautifulSoup(resp2.text, "html.parser")
    div2 = page2.find("div",class_="photo-pic")
    img = div2.find("img")
    src = url1+(img.get("src"))
    #print(src)
    #download the image
    img_resp = requests.get(src)
    #img_resp.content holds the raw bytes
    img_name = src.split("/")[-1]
    # take the part after the last "/", e.g. from https://pic.netbian.com/uploads/allimg/210831/102129-163037648996ad.jpg
    # this yields 102129-163037648996ad.jpg
    with open("img/"+img_name,mode="wb") as f:   #save into the img folder
        f.write(img_resp.content) #write the image bytes to the file
        #no explicit close needed: the with statement closes the file
    print(img_name + " downloaded OK")
    time.sleep(0.5)
print("OVER")
#thread pool + XPath extraction
import requests,csv
from lxml import etree
from concurrent.futures import ThreadPoolExecutor
headers={
    "User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:92.0) Gecko/20100101 Firefox/92.0"
}
f = open("1.csv",mode="w",encoding="UTF-8")
csvwriter = csv.writer(f)
def page1(url):
    resp = requests.get(url,headers=headers)
    #resp.encoding = "UTF-8"  # 指定字符集
    #print(resp.text)
    html = etree.HTML(resp.text)
    table = html.xpath("/html/body/div[4]/div[3]/div[3]/table[2]/tbody")[0]
    #print(table)
    trs = table.xpath("./tr")
    #iterate over the tr rows
    for tr in trs:
        txt = tr.xpath("./td/text()")
        #print(txt)
        #light cleanup of the data
        txt = (item.replace("\xa0","") for item in txt)
        #print(list(txt))
        #store the row
        csvwriter.writerow(txt)
    print(url+" done")
if __name__ == '__main__':
    #page1("http://www.maicainan.com/offer/show/classid/14/id/4652.html")
    #create the thread pool
    with ThreadPoolExecutor(50) as t:     #50 worker threads
        for i in range(11,99):              #88 tasks (i = 11..98)
            #submit each task to the pool
            t.submit(page1,f"http://www.maicainan.com/offer/show/classid/14/id/46{i}.html")
    print("all pages extracted")
Original post: https://www.cnblogs.com/bingtang123/p/15374364.html