Douban Scraper

import requests
from bs4 import BeautifulSoup
import pandas as pd

def get_html(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0'}  # spoof a browser User-Agent so Douban does not block the request
    resp = requests.get(url, headers=headers)
    return resp.text

url = 'https://movie.douban.com/top250'
html = get_html(url)
soup = BeautifulSoup(html, 'html.parser')


a = soup.find_all('div', class_='hd')

# movie titles: the first <span> inside each entry's link
film_name = []
for i in a:
    film_name.append(i.a.span.text)

# ratings
rating_score = soup.find_all('span', class_='rating_num')

lt = []
num = 20   # the first page lists 25 movies; only the top 20 are kept here
for i in range(num):
    lt.append([i + 1, film_name[i], rating_score[i].string])
df = pd.DataFrame(lt, columns=['排名', '电影名', '评分'])   # rank / title / score
df.to_csv(r'C:\Users\admir\Desktop\参考\豆瓣电影数据.csv')  # save to file, persisting the data
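The listing above only covers the first results page. The next section rewrites the scraper with regular expressions to walk all ten pages; as a minimal sketch of the same idea using BeautifulSoup (the loop, the variable names, and the one-second pause are my assumptions, not part of the original post), the start query parameter pages through the full Top 250:

import time

all_rows = []
for offset in range(0, 250, 25):   # 10 pages, 25 movies each
    page_soup = BeautifulSoup(get_html('https://movie.douban.com/top250?start=' + str(offset)), 'html.parser')
    titles = page_soup.find_all('div', class_='hd')
    scores = page_soup.find_all('span', class_='rating_num')
    for j, (t, s) in enumerate(zip(titles, scores)):
        all_rows.append([offset + j + 1, t.a.span.text, s.string])
    time.sleep(1)                  # pause between pages to stay polite to the server
df_all = pd.DataFrame(all_rows, columns=['排名', '电影名', '评分'])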

Adapting to the page format for batch output across all pages

import json  
import requests  
from requests.exceptions import RequestException  
import re  
import time 

def get_one_page(url):
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0'}  # request headers: pretend to be a normal browser
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None
def parse_one_page(html):
    # note: entries without a one-line quote (<span class="inq">) will not match this pattern
    pattern = re.compile('<li>.*?<em class="">(.*?)</em>.*?title.*?>(.*?)</span>'
                         '.*?<span class="rating_num" property="v:average">(.*?)</span>'
                         '.*?<span class="inq">(.*?)</span>', re.S)
    items = re.findall(pattern, html)
    for item in items:
        yield {'index': item[0],
               'title': item[1],
               'score': item[2],
               'comment': item[3]}
def write_to_file(content):
    # append one JSON object per line to a txt file; to write a csv file instead, just change the extension
    with open(r'C:\Users\admir\Desktop\参考\douban250.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')

def main(offset):
    url = 'https://movie.douban.com/top250?start=' + str(offset) + '&filter='
    html = get_one_page(url)
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)

if __name__ == '__main__':
    for i in range(10):        # 10 pages x 25 movies = the full Top 250
        main(offset=i * 25)
        time.sleep(1)          # pause between pages
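One caveat on the comment in write_to_file: renaming douban250.txt to .csv only changes the file name, while the contents remain one JSON object per line. As a hedged sketch (the douban250.csv output path is my assumption, mirroring the txt path above), pandas can convert the JSON-lines file into an actual CSV:

import pandas as pd

# read the JSON-lines file produced by write_to_file, one record per line
df = pd.read_json(r'C:\Users\admir\Desktop\参考\douban250.txt', lines=True)
# utf-8-sig keeps the Chinese titles readable when the CSV is opened in Excel
df.to_csv(r'C:\Users\admir\Desktop\参考\douban250.csv', index=False, encoding='utf-8-sig')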
Original post: https://www.cnblogs.com/celine227/p/14473221.html