Scraping Douban Books / Storing Data in Files / Storing Data in a Database

Scraping with requests + BeautifulSoup

import requests
from bs4 import BeautifulSoup
import json
# Build the URLs
# Send a request for each URL
# Process the response
# Write the results to a file

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:58.0) Gecko/20100101 Firefox/58.0',
}
# A proxy is not a request header; pass it through the proxies parameter of requests.get instead
proxies = {
    'http': 'http://60.13.42.109:9999',
}

def url_create(start_page, end_page):
    # Each listing page shows 20 books, so the start offset steps by 20
    for page in range(start_page, end_page + 1):
        url = 'https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=' + str(20 * (page - 1))
        get_response(url)

def get_response(url):
    print(url)
    # proxies is optional; drop the argument to connect directly
    response = requests.get(url, headers=headers, proxies=proxies).text
    check_response(response)

def check_response(response):
    soup = BeautifulSoup(response, 'lxml')
    result = soup.find_all(class_='subject-item')
    for item in result:
        info = item.find(class_='info')
        name = info.find(name='a').get_text().split()[0]
        # The .pub line reads "author / publisher / date / price"; its last token is the price
        price = info.find(class_='pub').get_text().split()[-1]
        score = info.find(class_='rating_nums').get_text().split()[0]
        record = {
            'title': name,
            'price': price,
            'score': score
        }
        write_file(record)


def write_file(record):
    with open('result1.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(record, ensure_ascii=False) + '\n')

if __name__ == '__main__':
    start_page = int(input("Start page: "))
    end_page = int(input("End page: "))
    url_create(start_page, end_page)

Scraping with urllib + XPath

import urllib.request
from lxml import etree
import json

response = urllib.request.urlopen('https://book.douban.com/tag/%E5%B0%8F%E8%AF%B4?start=0')
html = response.read().decode('utf-8')

result = etree.HTML(html)

name_list = "".join(result.xpath('//ul[@class="subject-list"]//h2//a/text()')).split()
author_list = "".join(result.xpath('//ul[@class="subject-list"]//div[@class="pub"]/text()')).split('\n')
score_list = result.xpath('//ul[@class="subject-list"]//span[@class="rating_nums"]/text()')

# The joined .pub text splits into a repeating group of lines per book;
# pick every fifth line starting at index 3, which holds the author name
author_names = []
k = 3
for i in range(len(author_list) // 4):
    try:
        author_names.append(author_list[k])
        k += 5
    except IndexError:
        pass

for i in range(len(name_list)):
    record = {
        'title': name_list[i],
        'author': author_names[i],
        'score': score_list[i]
    }
    with open('result2.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(record, ensure_ascii=False) + '\n')

Storing/Reading TXT Text

user_id = '10001'   # avoid the name id, which would shadow the built-in
name = 'Bob'
age = '22'

with open('data.txt', 'w') as f:
    f.write(' '.join([user_id, name, age]))   # one space-separated line
with open('data.txt', 'r') as f:
    data = f.readline()
    print(data)

Storing/Reading JSON Files
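
A minimal sketch using the standard json module; the file name data.json and the sample record are placeholders:

import json

record = {'id': '10001', 'name': 'Bob', 'age': '22'}   # placeholder record

# json.dump serializes the structure to the file; ensure_ascii=False keeps non-ASCII text readable
with open('data.json', 'w', encoding='utf-8') as f:
    json.dump(record, f, ensure_ascii=False)

# json.load parses the file back into Python objects
with open('data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
    print(data)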

Storing/Reading CSV Files

Method 1


import csv

with open('data.csv', 'w', newline='') as csvfile:
    # delimiter=' ' separates fields with spaces; without it the default is a comma on every row
    writer = csv.writer(csvfile, delimiter=' ')
    writer.writerow(['id', 'name', 'age'])
    writer.writerow(['10001', 'mike', '22'])
    writer.writerow(['10002', 'bob', '25'])
    writer.writerow(['10003', 'jodan', '24'])

Figure 1

Method 2

import csv

with open('data.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=' ')
    writer.writerow(['id', 'name', 'age'])
    # writerows writes several rows at once; the result matches Figure 1
    writer.writerows([['10001', 'mike', '22'], ['10002', 'bob', '25'], ['10003', 'jodan', '24']])

Method 3

import csv

with open('data.csv', 'w', newline='') as csvfile:
    # Adding encoding='utf-8' avoids garbled output when writing Chinese text;
    # newline='' prevents the blank line between rows seen in Figure 1
    fieldnames = ['id', 'name', 'age']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerow({'id': '10001', 'name': 'mike', 'age': 22})
    writer.writerow({'id': '10002', 'name': 'bob', 'age': 25})
    writer.writerow({'id': '10003', 'name': 'jodan', 'age': 24})   # result matches Figure 1

Reading

Method 1

import csv

with open('data.csv', 'r', encoding='utf-8') as csvfile:
    # Pass delimiter=' ' here as well if the file was written space-delimited
    reader = csv.reader(csvfile)
    for row in reader:
        print(row)
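
For a file written with DictWriter as in Method 3 above, csv.DictReader (also in the standard library) reads each row back as a dict keyed by the header row; a minimal sketch:

import csv

with open('data.csv', 'r', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)   # the first row supplies the field names
    for row in reader:
        print(row['id'], row['name'], row['age'])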

Method 2

import pandas as pd

# pandas assumes comma-separated data by default; pass sep=' ' if the file was written space-delimited
df = pd.read_csv('data.csv')
print(df)
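
Storing Data in a Database

The title also covers database storage; a minimal sketch using Python's built-in sqlite3 module, where the file name books.db, the table name, and its columns are assumptions chosen to match the scraped fields:

import sqlite3

# Connect; the database file is created if it does not exist
conn = sqlite3.connect('books.db')
cursor = conn.cursor()

# One column per scraped field
cursor.execute('CREATE TABLE IF NOT EXISTS books (title TEXT, price TEXT, score TEXT)')

# Insert a placeholder record with a parameterized query
record = {'title': 'example', 'price': '20.00', 'score': '9.0'}
cursor.execute('INSERT INTO books VALUES (?, ?, ?)',
               (record['title'], record['price'], record['score']))
conn.commit()

# Read the rows back
for row in cursor.execute('SELECT * FROM books'):
    print(row)
conn.close()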


Original article: https://www.cnblogs.com/yzcstart/p/10892724.html