Third Assignment

Assignment ①

(1): Pick a website and crawl all of the images on it, for example the China Weather Network (http://www.weather.com.cn). Crawl it once single-threaded and once multi-threaded.

Single-threaded crawling

#!/usr/bin/env python
# _*_ coding:utf-8 _*_
# author: xm time:2020/10/14
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import urllib.request
def imageSpider(start_url):
    try:
        urls=[]
        req=urllib.request.Request(start_url,headers=headers)
        data=urllib.request.urlopen(req)
        data=data.read()
        dammit = UnicodeDammit(data, ["utf-8", "gbk"])
        data = dammit.unicode_markup
        soup = BeautifulSoup(data, "lxml")
        images = soup.select("img")
        for image in images:
            try:
                src = image["src"]
                url = urllib.request.urljoin(start_url, src)
                if url not in urls:
                    urls.append(url)
                    print(url)
                    download(url)
            except Exception as err: print(err)
    except Exception as err:
        print(err)

def download(url):
    global count
    try:
        count=count+1
        # extract the file suffix (extension), e.g. ".jpg"
        if (url[len(url) - 4] == "."):
            ext = url[len(url) - 4:]
        else:
            ext = ""
        req = urllib.request.Request(url,headers=headers)
        data = urllib.request.urlopen(req, timeout=100)
        data = data.read()
        fobj = open("images\" + str(count) + ext, "wb")
        fobj.write(data)
        fobj.close()
        print("downloaded " + str(count) + ext)
    except Exception as err:
        print(err)

start_url="http://www.weather.com.cn/weather/101280601.shtml"
headers = {
"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"}
count=0
imageSpider(start_url)


Multi-threaded crawling

#!/usr/bin/env python
# _*_ coding:utf-8 _*_
# author: xm time:2020/10/15
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import urllib.request
import threading
import time

def imageSpider(start_url):
    global threads
    global count
    try:
        urls = []
        req = urllib.request.Request(start_url, headers=headers)
        data = urllib.request.urlopen(req)
        data = data.read()
        dammit = UnicodeDammit(data, ["utf-8", "gbk"])
        data = dammit.unicode_markup
        soup = BeautifulSoup(data, "lxml")
        images1 = soup.select("img")
        for image in images1:
            try:
                src = image["src"]
                url = urllib.request.urljoin(start_url, src)
                if url not in urls:
                    urls.append(url)
                    print(url)
                    count = count + 1
                    T = threading.Thread(target=download, args=(url, count))
                    T.daemon = False  # non-daemon thread, so the program waits for the download
                    T.start()
                    threads.append(T)
            except Exception as err:
                print(err)
    except Exception as err:
        print(err)


def download(url, count):
    try:
        if (url[len(url)-4] == "."):
            ext = url[len(url)-4:]
        else:
            ext = ""
        req = urllib.request.Request(url, headers=headers)
        data = urllib.request.urlopen(req, timeout=100)
        data = data.read()
        fobj = open("images\" + str(count) + ext, "wb")
        fobj.write(data)
        fobj.close()
        print("downloaded " + str(count) + ext)
    except Exception as err:
        print(err)


print("More Threads Craw JPG Images")
start_url = "http://www.weather.com.cn/weather/101280601.shtml"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US;rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"}
count = 0
threads = []

time_start = time.time()

imageSpider(start_url)
for t in threads:
    t.join()
print("the End")
time_end = time.time()
time_using = time_end - time_start
print("More Threads Craw JPG Images Time Using:", time_using, 's')


(2): Reflections

I tried implementing the multi-threaded code in practice. A few small issues came up along the way with how the target function and its arguments are passed, and I had to look up how threading.Thread is used, but overall the process went fairly smoothly.
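As a minimal sketch of the threading.Thread pattern used above (download_stub and the URL list below are placeholders for illustration, not the real crawler):

import threading

def download_stub(url, count):
    # stands in for the real download(url, count) function above
    print("worker", count, "fetching", url)

threads = []
urls = ["http://example.com/a.jpg", "http://example.com/b.jpg"]
for count, url in enumerate(urls, start=1):
    t = threading.Thread(target=download_stub, args=(url, count))  # args must be a tuple
    t.start()
    threads.append(t)

for t in threads:
    t.join()  # block until every worker thread has finished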

Assignment ②

(1): Reproduce Assignment ① using the Scrapy framework

Workflow:

#!/usr/bin/env python
# _*_ coding:utf-8 _*_
# author: xm time:2020/10/16
import scrapy
from ..items import WeatherPhotoItem
from scrapy.selector import Selector
class Spider_weatherphoto(scrapy.Spider):

    name = "spiderweatherphoto"#给定爬虫的名字
    start_urls=["http://www.weather.com.cn/"]
    # parse() is the callback Scrapy invokes with each downloaded response
    def parse(self, response):

        try:

            data = response.body.decode()
            selector = Selector(text=data)
            s = selector.xpath("//img/@src").extract()  # extract() returns the src attribute values as a list of strings
            for e in s:  # loop over every image URL
                item = WeatherPhotoItem()
                item["photo"] = [e]  # "photo" is the field declared in items.py; ImagesPipeline expects a list of URLs
                yield item  # hand the item over to the pipeline
        except Exception as err:
            print(err)


         # print(response.url)
        # item = WeatherPhotoItem()
        #  data = response.body.decode()
         # print(data)
         # selector=Selector(text=data)
         # s=selector.xpath("//img/@src")
         #
         # print(s)
         # print(s.extract())
         # for e in s:
         #     print(e.extract())


        # lis = response.xpath("//div/a/img")
        # # print(lis)
        # for li in lis:
        #     src = li.xpath("@src")
        #     print(src)
        # item["image"] = response
        # data=response.body.decode()
        # print(data)

items.py:

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class WeatherPhotoItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    photo = scrapy.Field()  # the image URL field to collect
    # pass

settings.py:

ITEM_PIPELINES = {
    # 'weather_photo.pipelines.WeatherPhotoPipeline': 300,
    'scrapy.pipelines.images.ImagesPipeline': 1  # use Scrapy's built-in image downloader instead of a custom pipeline
}
IMAGES_STORE = r'D:\anaconda\example\data_acquisition\down_images'
IMAGES_URLS_FIELD = 'photo'

Run script:

#!/usr/bin/env python
# _*_ coding:utf-8 _*_
# author: xm time:2020/10/16
from scrapy import cmdline
cmdline.execute("scrapy crawl spiderweatherphoto -s LOG_ENABLED=False".split())

(2): Reflections

Using the Scrapy framework for the first time was indeed a bit tricky, but XPath is genuinely handy: searching the whole document for a tag is very convenient. Scrapy's built-in image downloader can be enabled in settings.py, so there is no need to write a download function of your own, which is quite convenient, and the names of the data fields you want to collect are declared in items.py.
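As a one-file sketch of the points above (the item, spider, and folder names here are illustrative; the actual assignment splits these across the spider, items.py, and settings.py files shown earlier):

import scrapy
from scrapy.crawler import CrawlerProcess

class PhotoItem(scrapy.Item):
    photo = scrapy.Field()                      # list of image URLs for ImagesPipeline

class WeatherImgSpider(scrapy.Spider):
    name = "weatherimg_sketch"
    start_urls = ["http://www.weather.com.cn/"]

    def parse(self, response):
        for src in response.xpath("//img/@src").extract():   # all <img src=...> values
            item = PhotoItem()
            item["photo"] = [response.urljoin(src)]           # absolute URL, wrapped in a list
            yield item

if __name__ == "__main__":
    process = CrawlerProcess(settings={
        "ITEM_PIPELINES": {"scrapy.pipelines.images.ImagesPipeline": 1},
        "IMAGES_STORE": "down_images",          # save folder (ImagesPipeline needs Pillow installed)
        "IMAGES_URLS_FIELD": "photo",           # read the URLs from the item's photo field
        "LOG_ENABLED": False,
    })
    process.crawl(WeatherImgSpider)
    process.start()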

Assignment ③

(1): Use the Scrapy framework to crawl stock information.

Eastmoney: https://www.eastmoney.com/    Sina Finance stocks: http://finance.sina.com.cn/stock/
Workflow (a quick look at the JSON API first, then the spider code):
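Since the Eastmoney quote endpoint returns JSON rather than an HTML page, a quick standalone probe (a sketch; the endpoint and f-field codes are the ones the spider below relies on, with the field list trimmed) shows the data → diff structure:

import json
import urllib.request

# the Eastmoney quote-list endpoint used by the spider below, trimmed to a few fields
url = ("http://75.push2.eastmoney.com/api/qt/clist/get?&pn=1&pz=20&po=1&np=1"
       "&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f3"
       "&fs=m:1+t:2,m:1+t:23&fields=f2,f3,f4,f12,f14")

with urllib.request.urlopen(url, timeout=30) as resp:
    quotes = json.loads(resp.read().decode())

for row in quotes["data"]["diff"]:     # "diff" is a list with one dict per stock
    # f12 = code, f14 = name, f2 = latest price, f3 = change %, f4 = change amount
    print(row["f12"], row["f14"], row["f2"], row["f3"], row["f4"])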

#!/usr/bin/env python
# _*_ coding:utf-8 _*_
# author: xm time:2020/10/17
import scrapy
import json
import re
from ..items import GupiaodataItem
class spider_gupiao(scrapy.Spider):

    name = "spidergupiao"
    # for j in range(1,10):
    
    
    start_urls=["http://75.push2.eastmoney.com/api/qt/clist/get?&pn=1&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f3&fs=m:1+t:2,m:1+t:23&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152&_=1602901412583%20Request%20Method:%20GET"]

    def parse(self, response):
        try:
            sites = json.loads(response.body_as_unicode())  # the response body is JSON, not HTML
            data = sites["data"]
            diff = data["diff"]
            # pat = re.compile("[{.*?}]")
            # data3 = pat.
            # print(sites)
            for i in range(len(diff)):
                item = GupiaodataItem()
                item["mount"] = str(i)                       # row number
                item["code"] = str(diff[i]["f12"])           # stock code
                item["name"] = str(diff[i]["f14"])           # stock name
                item["lately"] = str(diff[i]["f2"])          # latest price
                item["zhangdiefu"] = str(diff[i]["f3"])      # change percentage
                item["zhangdiee"] = str(diff[i]["f4"])       # change amount
                item["chengjiaoliang"] = str(diff[i]["f5"])  # trading volume
                item["chengjiaoe"] = str(diff[i]["f6"])      # turnover
                item["zhenfu"] = str(diff[i]["f7"])          # amplitude
                item["zuigao"] = str(diff[i]["f15"])         # daily high
                item["zuidi"] = str(diff[i]["f16"])          # daily low
                item["jinkai"] = str(diff[i]["f17"])         # today's open
                item["zuoshou"] = str(diff[i]["f18"])        # previous close
                yield item
        except Exception as err:
            print(err)
        # song_item=response.meta["item"]
        # js = json.loads(response.text)["data"]
        # song_item["data"] = data
        # print(data)

items.py:

# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class GupiaodataItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    mount = scrapy.Field()
    code = scrapy.Field()
    name = scrapy.Field()
    lately = scrapy.Field()
    zhangdiefu = scrapy.Field()
    zhangdiee = scrapy.Field()
    chengjiaoliang = scrapy.Field()
    chengjiaoe = scrapy.Field()
    zhenfu = scrapy.Field()
    zuigao = scrapy.Field()
    zuidi = scrapy.Field()
    jinkai = scrapy.Field()
    zuoshou = scrapy.Field()
    # pass

pipelines.py:

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter

class GupiaodataPipeline:
    count = 0
    def process_item(self, item, spider):
        GupiaodataPipeline.count+=1
        # format template to keep the columns aligned; argument 13 (chr(12288), the full-width space) is the fill character for the Chinese name column
        tplt = "{0:^2}\t{1:^1}\t{2:{13}^4}\t{3:^5}\t{4:^6}\t{5:^6}\t{6:^6}\t{7:^10}\t{8:^10}\t{9:^10}\t{10:^10}\t{11:^10}\t{12:^10}"
        try:
            if GupiaodataPipeline.count == 1:  # first call: create data.txt and write the header row
                fobj = open("data.txt", "wt")  # (re)create data.txt
                fobj.write("序号" + "        股票代码" + "    股票名称  " + " 最新报价  " + " 涨跌幅  " + "  涨跌额  " +
                           "      成交量   " + "       成交额         " + "            振幅   " + "      最高   " + "      最低   " + "      今开    " + "       昨收  " + "\n")
            else:  # later calls (count > 1): open the existing file and append the item
                fobj = open("data.txt", "at")
            # fobj.write("序号"+"  股票代码  "+"  股票名称  "+"  最新报价  "+"涨跌幅"+"涨跌额"+
            #            "成交量"+"成交额"+"振幅"+"最高"+"最低"+"今开"+"昨收"+"\n")
            fobj.write(
                tplt.format(item["mount"], item["code"], item["name"], item['lately'], item['zhangdiefu'],
                            item['zhangdiee'], item['chengjiaoliang'],item['chengjiaoe'],item['zhenfu'],
                            item['zuigao'],item['zuidi'],item['jinkai'],item['zuoshou'],chr(12288)))
            fobj.write("
")
            
            fobj.close()
        except Exception as err:
            print(err)
        return item

settings.py:

ITEM_PIPELINES = {
   'gupiaodata.pipelines.GupiaodataPipeline': 300,
}

Run script:

#!/usr/bin/env python
# _*_ coding:utf-8 _*_
# author: xm time:2020/10/17
from scrapy import cmdline
cmdline.execute("scrapy crawl spidergupiao -s LOG_ENABLED=False".split())

Partial screenshot of the results:

(2): Reflections

This was broadly similar to Assignment ②, but because the data returned is in JSON format, XPath cannot be used. I switched to json.loads() and read the values with dictionary-style access such as js[i]["f12"], which works well. The next problem was the output format when saving the data: plain string concatenation requires str() everywhere and is hard to align, so I used format() instead, with chr(12288) (the full-width space) as the fill character for the Chinese columns, and wrote the records out through a pipeline class. The process was slightly difficult in places.
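A small standalone sketch of those two points, with made-up quote rows shaped like the API payload (json.loads to parse, and str.format with chr(12288), the full-width space, as the fill character so the Chinese name column stays aligned):

import json

# made-up rows shaped like the Eastmoney payload: data -> diff -> list of dicts
payload = ('{"data": {"diff": ['
           '{"f12": "600000", "f14": "浦发银行", "f2": 9.86},'
           '{"f12": "600036", "f14": "招商银行", "f2": 37.50}]}}')

rows = json.loads(payload)["data"]["diff"]

# {1:{3}^8} centers the name in 8 columns padded with argument 3, chr(12288),
# the full-width space, which matches the width of Chinese characters
tplt = "{0:^8}\t{1:{3}^8}\t{2:^8}"
print(tplt.format("代码", "名称", "最新价", chr(12288)))
for row in rows:
    print(tplt.format(row["f12"], row["f14"], str(row["f2"]), chr(12288)))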

Original post: https://www.cnblogs.com/lmmlm/p/13837070.html