大二下学期团队项目(豆瓣爬取)

前面已能够爬取豆瓣电影所需要的所有信息,今日主要修改了爬取豆瓣的代码,进行了一些优化,主要是以下两部分代码:

def insert_data(data_beans, headers, cursor, conn):
    """Insert scraped Douban movie records into the `moviebean` table.

    Args:
        data_beans: list of result pages; each page is a list of movie dicts
            from the Douban search API (keys seen here: rate, directors,
            title, cover, casts, url).
        headers: HTTP headers (user-agent) reused for the detail-page requests.
        cursor: open DB cursor used for the INSERTs.
        conn: DB connection; committed once after all rows are inserted.

    Any exception is printed via traceback and swallowed (best-effort batch).
    """
    try:
        for page in data_beans:
            # each page holds up to 20 movie entries
            for movie in page:
                # fields available directly from the search-API payload
                score = movie["rate"].replace(" ", "")
                director_str = ""
                for d in movie["directors"]:
                    director_str = director_str + " " + d
                name = movie["title"].replace(" ", "")
                img = movie["cover"].replace(" ", "")
                star_str = ""
                for c in movie["casts"]:
                    star_str = star_str + " " + c

                # fetch the movie's detail page for the remaining fields
                url_details = movie["url"].replace(" ", "")
                r = requests.get(url_details, headers=headers)
                soup_bean = BeautifulSoup(r.text, "lxml")
                # genre: concatenate every v:genre span
                # (renamed from `type`, which shadowed the builtin; the inner
                # loop variable no longer shadows the outer movie dict)
                genre_str = ""
                for g in soup_bean.find_all("span", {"property": "v:genre"}):
                    genre_str = genre_str + " " + g.text
                span = soup_bean.find_all("span", {"property": "v:runtime"})
                timelen = span[0].text.replace(" ", "")
                span = soup_bean.find_all("span", {"property": "v:initialReleaseDate"})
                date = span[0].text.replace(" ", "")
                # BUG FIX: original passed a set literal {"class", "rating_people"};
                # BeautifulSoup's attrs filter must be a dict of attr -> value.
                span = soup_bean.find("a", {"class": "rating_people"})
                scorenum = span.text.replace(" ", "")
                span = soup_bean.find("span", {"property": "v:summary"})
                summary = span.text.replace(" ", "")  # strip whitespace
                # production country/region and language only appear as raw
                # HTML in the info block, so they are pulled out with regexes
                ex = ' <span class="pl">制片国家/地区:</span> (.*?)<br/>'
                test = re.findall(ex, r.text, re.S)
                area = test[0].replace(" ", "")
                ex2 = '<span class="pl">语言:</span> (.*?)<br/>'
                test = re.findall(ex2, r.text, re.S)
                language = test[0].replace(" / ", " ")
                print(url_details)
                sql = "insert into moviebean values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
                cursor.execute(sql,
                               [name, star_str, director_str, genre_str, area, date, summary, score, language, img, scorenum,
                                timelen])
        conn.commit()  # commit once after the whole batch of inserts
        print(f"{time.asctime()}插入数据完毕")
    except Exception:
        # narrowed from a bare `except:` so SystemExit/KeyboardInterrupt propagate
        traceback.print_exc()
def get_tencent_data():
    """Crawl Douban movie search results in pages of 20 and store them.

    Fetches 501 pages from the Douban JSON search endpoint, starting at a
    hard-coded resume offset, and hands each page to insert_data().
    Relies on the module-level helpers get_conn()/close_conn()/insert_data().
    """
    # Douban paged search endpoint; the `start=` offset is appended per request
    url_bean = 'https://movie.douban.com/j/new_search_subjects?sort=T&range=0,10&tags=%E7%94%B5%E5%BD%B1&start='

    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36',
    }
    # (removed dead `cursor = None; conn = None` that was immediately overwritten)
    conn, cursor = get_conn()
    num = 3240  # resume offset; previous runs stopped at 1440/3020/2760/3100/3180
    # original nested while loops ran exactly 501 single-page batches;
    # a plain for-loop expresses that directly
    for _ in range(501):
        num_str = '%d' % num
        num = num + 20
        # fetch one page (20 movies) of search results
        r = requests.get(url_bean + num_str, headers=headers)
        print(num_str)
        res_bean = json.loads(r.text)
        print(url_bean + num_str)
        print(f"{time.asctime()}开始插入数据")
        insert_data([res_bean["data"]], headers, cursor, conn)
    print(f"{time.asctime()}所有数据插入完毕")
    close_conn(conn, cursor)
原文地址:https://www.cnblogs.com/fengchuiguobanxia/p/14725084.html