Assignment 5

  • Assignment ①:

    • Requirements:

      • Master locating HTML elements with Selenium, scraping Ajax-loaded page data, waiting for HTML elements, and related techniques.
      • Use the Selenium framework to scrape product information and images for one category of goods on JD.com.
    • Candidate site: http://www.jd.com/

    • Keyword: freely chosen by the student

    • Output: the MySQL output format is as follows

      mNo     mMark       mPrice    mNote                            mFile
      000001  三星Galaxy  9199.00   三星Galaxy Note20 Ultra 5G...    000001.jpg
      000002  ......
    • Code:
      from selenium import webdriver
      from selenium.webdriver.chrome.options import Options
      import urllib.request
      import threading
      import sqlite3
      import os
      import datetime
      from selenium.webdriver.common.keys import Keys
      import time
      
      
      class MySpider:
          headers = {
              "User-Agent": "SMozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                            "Chrome/86.0.4240.183 Safari/537.361 "
              }
      
          # Folder for downloaded images
          imagePath = "download"
      
          def startUp(self, url, key):
              # Initializing Chrome browser
              chrome_options = Options()
              chrome_options.add_argument('--headless')
              chrome_options.add_argument('--disable-gpu')
              self.driver = webdriver.Chrome(options=chrome_options)
      
              # Initializing variables
              self.threads = []
              self.No = 0
              self.imgNo = 0
              # Initializing database
              try:
                  self.con = sqlite3.connect("phones.db")
                  self.cursor = self.con.cursor()
                  try:
                      # Drop the table if it already exists
                      self.cursor.execute("drop table phones")
                  except:
                      pass
                  try:
                      # Create a fresh table
                      sql = "create table phones (mNo varchar(32) primary key, mMark varchar(256), mPrice varchar(32), mNote varchar(1024), mFile varchar(256))"
                      self.cursor.execute(sql)
                  except:
                      pass
      
              except Exception as err:
                  print(err)
              # Initializing the images folder
              try:
                  if not os.path.exists(MySpider.imagePath):
                      os.mkdir(MySpider.imagePath)
                  images = os.listdir(MySpider.imagePath)
                  for img in images:
                      s = os.path.join(MySpider.imagePath, img)
                      os.remove(s)
              except Exception as err:
                  print(err)
              self.driver.get(url)
              keyInput = self.driver.find_element_by_id("key")
              keyInput.send_keys(key)
              keyInput.send_keys(Keys.ENTER)
      
          def closeUp(self):
              try:
      
                  self.con.commit()
                  self.con.close()
                  self.driver.close()
      
              except Exception as err:
                  print(err)
      
          def insertDB(self, mNo, mMark, mPrice, mNote, mFile):
              try:
                  sql = "insert into phones (mNo,mMark,mPrice,mNote,mFile) values (?,?,?,?,?)"
                  self.cursor.execute(sql, (mNo, mMark, mPrice, mNote, mFile))
              except Exception as err:
                  print(err)
      
          def showDB(self):
              try:
                  con = sqlite3.connect("phones.db")
                  cursor = con.cursor()
                  print("%-8s%-16s%-8s%-16s%s" % ("No", "Mark", "Price", "Image", "Note"))
                  cursor.execute("select mNo,mMark,mPrice,mFile,mNote from phones  order by mNo")
      
                  rows = cursor.fetchall()
                  for row in rows:
                      print("%-8s %-16s %-8s %-16s %s" % (row[0], row[1], row[2], row[3], row[4]))
      
                  con.close()
              except Exception as err:
                  print(err)
      
          def download(self, src1, src2, mFile):
              data = None
              if src1:
                  try:
                      req = urllib.request.Request(src1, headers=MySpider.headers)
                      resp = urllib.request.urlopen(req, timeout=10)
                      data = resp.read()
                  except:
                      pass
              if not data and src2:
                  try:
                      req = urllib.request.Request(src2, headers=MySpider.headers)
                      resp = urllib.request.urlopen(req, timeout=10)
                      data = resp.read()
                  except:
                      pass
              if data:
                  print("download begin", mFile)
                    fobj = open(os.path.join(MySpider.imagePath, mFile), "wb")
                  fobj.write(data)
                  fobj.close()
                  print("download finish", mFile)
      
          def processSpider(self):
              try:
                  time.sleep(1)
                  print(self.driver.current_url)
                  lis = self.driver.find_elements_by_xpath("//div[@id='J_goodsList']//li[@class='gl-item']")
                  for li in lis:
                      # We find that the image is either in src or in data-lazy-img attribute
                      try:
                          src1 = li.find_element_by_xpath(".//div[@class='p-img']//a//img").get_attribute("src")
                      except:
                          src1 = ""
      
                      try:
                          src2 = li.find_element_by_xpath(".//div[@class='p-img']//a//img").get_attribute("data-lazy-img")
                      except:
                          src2 = ""
                      try:
                          price = li.find_element_by_xpath(".//div[@class='p-price']//i").text
                      except:
                          price = "0"
      
                      try:
                          note = li.find_element_by_xpath(".//div[@class='p-name p-name-type-2']//a//em").text
                          mark = note.split(" ")[0]
                          mark = mark.replace("爱心东东
      ", "")
                          mark = mark.replace(",", "")
                          note = note.replace("爱心东东
      ", "")
                          note = note.replace(",", "")
      
                      except:
                          note = ""
                          mark = ""
                          src2 = ""
                      self.No = self.No + 1
                      no = str(self.No)
                      while len(no) < 6:
                          no = "0" + no
                      print(no, mark, price)
                      if src1:
                          src1 = urllib.request.urljoin(self.driver.current_url, src1)
                          p = src1.rfind(".")
                          mFile = no + src1[p:]
                      elif src2:
                          src2 = urllib.request.urljoin(self.driver.current_url, src2)
                          p = src2.rfind(".")
                          mFile = no + src2[p:]
                      if src1 or src2:
                          T = threading.Thread(target=self.download, args=(src1, src2, mFile))
                          T.setDaemon(False)
                          T.start()
                          self.threads.append(T)
                      else:
                          mFile = ""
                      self.insertDB(no, mark, price, note, mFile)
                  try:
                      # On the last page the "next page" button is disabled; stop recursing there
                      self.driver.find_element_by_xpath("//span[@class='p-num']//a[@class='pn-next disabled']")
                  except:
                      nextPage = self.driver.find_element_by_xpath("//span[@class='p-num']//a[@class='pn-next']")
                      time.sleep(10)
                      nextPage.click()
                      self.processSpider()
              except Exception as err:
                  print(err)
      
          def executeSpider(self, url, key):
              starttime = datetime.datetime.now()
              print("Spider starting......")
              self.startUp(url, key)
              print("Spider processing......")
              self.processSpider()
              print("Spider closing......")
              self.closeUp()
              for t in self.threads:
                  t.join()
              print("Spider completed......")
              endtime = datetime.datetime.now()
              elapsed = (endtime - starttime).seconds
              print("Total ", elapsed, " seconds elapsed")
      
      
      url = "https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&wq=%E6%89%8B%E6%9C%BA&pvid" 
            "=3b49f8ac7fda4e90be468e2e23de7bfc "
      spider = MySpider()
      while True:
          print("1.爬取")
          print("2.显示")
          print("3.退出")
          s = input("请选择(1,2,3):")
          if s == "1":
              spider.executeSpider(url, "手机")
              continue
          elif s == "2":
              spider.showDB()
              continue
          elif s == "3":
              break
    • Results:
    • Reflections:

        Experiment 1 was mostly a matter of typing in the code the teacher provided. The Selenium framework is fairly involved and was hard to grasp at first, but working through the code alongside the teacher's slides taught me a great deal. Selenium's automated crawling showed me how enjoyable scraping can be. The main thing to watch is using the time.sleep() function sensibly; otherwise the page may not have loaded and no data comes back. A sketch of Selenium's explicit waits, a more robust alternative to fixed sleeps, follows.
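        A note beyond the assignment code: instead of fixed time.sleep() calls, Selenium's explicit waits poll until an element actually appears, which is more robust on Ajax pages. A minimal sketch, reusing the goods-list XPath from the spider above (WebDriverWait and expected_conditions are standard Selenium APIs):

          from selenium import webdriver
          from selenium.webdriver.common.by import By
          from selenium.webdriver.support.ui import WebDriverWait
          from selenium.webdriver.support import expected_conditions as EC

          driver = webdriver.Chrome()
          driver.get("https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8")

          # Wait up to 10 seconds for the goods list to render; raises TimeoutException
          # on failure instead of silently scraping an empty page.
          lis = WebDriverWait(driver, 10).until(
              EC.presence_of_all_elements_located(
                  (By.XPATH, "//div[@id='J_goodsList']//li[@class='gl-item']")
              )
          )
          print(len(lis), "items loaded")
          driver.quit()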

 

Assignment ②

  • Requirements:

    • Master locating HTML elements with Selenium, scraping Ajax-loaded page data, waiting for HTML elements, and related techniques.
    • Use the Selenium framework plus MySQL storage to scrape stock data from the three boards "沪深A股" (SSE & SZSE A-shares), "上证A股" (SSE A-shares), and "深证A股" (SZSE A-shares).
  • Candidate site: East Money (东方财富网): http://quote.eastmoney.com/center/gridlist.html#hs_a_board

  • Output: MySQL storage and output format as shown below. Column names should be in English and designed by the student, e.g. serial number: id, stock code: bStockNo...; a table-creation sketch follows the sample output:

    No.  Code    Name   Latest  Change%  Change  Volume   Turnover  Amplitude  High   Low    Open   Prev.Close
    1    688093  N世华  28.47   62.22%   10.92   26.13万  7.6亿     22.34      32.0   28.08  30.2   17.55
    2    ......
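    Since the schema is left to each student, here is a minimal table-creation sketch (not part of the original post) whose column names match the INSERT statement in the code below; the varchar sizes are illustrative assumptions, and every value is stored as a string exactly as scraped:

      import pymysql

      # Assumed schema for the stocks table used by the spider below;
      # column names mirror the INSERT statement, sizes are guesses.
      con = pymysql.connect(host="localhost", port=3306, user="root",
                            passwd='523523', db="mydb", charset="utf8")
      cursor = con.cursor()
      cursor.execute("""
          create table if not exists stocks (
              wId   varchar(16),  -- serial number
              wCode varchar(16),  -- stock code
              wName varchar(64),  -- stock name
              wZxj  varchar(16),  -- latest price
              wZdf  varchar(16),  -- change percent
              wZde  varchar(16),  -- change amount
              wCjl  varchar(32),  -- volume
              wCje  varchar(32),  -- turnover
              wZf   varchar(16),  -- amplitude
              wZg   varchar(16),  -- high
              wZd   varchar(16),  -- low
              wJk   varchar(16),  -- open
              wZs   varchar(16)   -- previous close
          ) default charset=utf8
      """)
      con.commit()
      con.close()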
  • Code:
    import pymysql
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    import time
    
    
    class stocks:
        headers = {
            "User-Agent": "SMozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/86.0.4240.183 Safari/537.361 "
        }
    
        def startUp(self, url):
            # Initializing Chrome browser
            chrome_options = Options()
            chrome_options.add_argument('--headless')
            chrome_options.add_argument('--disable-gpu')
            self.driver = webdriver.Chrome(options=chrome_options)
            try:
                self.con = pymysql.connect(host="localhost", port=3306, user="root", passwd='523523', db="mydb",
                                           charset="utf8")
                self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
                self.cursor.execute("delete from stocks")
                self.opened = True
            except Exception as err:
                print(err)
                self.opened = False
    
        # Load the target URL
            self.driver.get(url)
    
        def closeUp(self):
            if self.opened:
                self.con.commit()
                self.con.close()
                self.opened = False
                self.driver.close()
            print("closed")
    
        def processSpider(self):
            time.sleep(1)
            try:
                trs = self.driver.find_elements_by_xpath(
                    "//div[@class='listview full']/table[@id='table_wrapper-table']/tbody/tr")
                time.sleep(1)
                for tr in trs:
                    time.sleep(1)
                    id = tr.find_element_by_xpath(".//td[position()=1]").text  # serial number
                    code = tr.find_element_by_xpath(".//td[position()=2]/a").text  # stock code
                    name = tr.find_element_by_xpath(".//td[position()=3]/a").text  # stock name
                    zxj = tr.find_element_by_xpath(".//td[position()=5]/span").text  # latest price
                    zdf = tr.find_element_by_xpath(".//td[position()=6]/span").text  # change percent
                    zde = tr.find_element_by_xpath(".//td[position()=7]/span").text  # change amount
                    cjl = tr.find_element_by_xpath(".//td[position()=8]").text  # volume
                    time.sleep(1)
                    cje = tr.find_element_by_xpath(".//td[position()=9]").text  # turnover
                    zf = tr.find_element_by_xpath(".//td[position()=10]").text  # amplitude
                    zg = tr.find_element_by_xpath(".//td[position()=11]/span").text  # high
                    zd = tr.find_element_by_xpath(".//td[position()=12]/span").text  # low
                    jk = tr.find_element_by_xpath(".//td[position()=13]/span").text  # today's open
                    zs = tr.find_element_by_xpath(".//td[position()=14]").text  # previous close
                    time.sleep(1)
                    # Print the scraped row
                    print(id, code, name, zxj, zdf, zde, cjl, cje, zf, zg, zd, jk, zs)
                    # Store the row in the database
                    if self.opened:
                        self.cursor.execute(
                            "insert into stocks(wId, wCode, wName, wZxj, wZdf, wZde, wCjl, wCje, wZf, wZg, wZd, wJk, wZs) "
                            "values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
                            (str(id), str(code), str(name), str(zxj), str(zdf), str(zde), str(cjl), str(cje), str(zf), str(zg), str(zd), str(jk), str(zs)))
            except Exception as err:
                print(err)
    
        def executeSpider(self, url):
            print("Spider starting......")
            self.startUp(url)
            print("Spider processing......")
            self.processSpider()
            print("Spider closing......")
            self.closeUp()
    
    
    url = "http://quote.eastmoney.com/center/gridlist.html#hs_a_board"
    spider = stocks()
    spider.executeSpider(url)
  • Results:
  • Reflections:

       Experiment 2 builds on Experiment 1: the scraping statements were adapted, a connection to MySQL was set up, and the scraped data was written into a table. A quick read-back sketch for checking what was stored follows.
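       As a quick check (not part of the original post), the stored rows can be read back with pymysql; this sketch assumes the stocks table sketched earlier, and DictCursor returns each row as a dict keyed by column name:

         import pymysql

         con = pymysql.connect(host="localhost", port=3306, user="root",
                               passwd='523523', db="mydb", charset="utf8")
         cursor = con.cursor(pymysql.cursors.DictCursor)
         # wId is stored as a varchar, so "+ 0" coerces it for numeric ordering
         cursor.execute("select wId, wCode, wName, wZxj from stocks order by wId + 0")
         for row in cursor.fetchall():
             print(row["wId"], row["wCode"], row["wName"], row["wZxj"])
         con.close()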

Assignment ③:

  • Requirements:

    • Master locating HTML elements with Selenium, simulating a user login (see the hedged login sketch after the sample output), scraping Ajax-loaded page data, waiting for HTML elements, and related techniques.
    • Use Selenium plus MySQL to scrape course information from China's MOOC platform (course ID, course name, school, lead teacher, team members, enrollment count, course schedule, course description).
  • Candidate site: icourse163 (中国mooc网): https://www.icourse163.org

  • Output: MySQL storage and output format:

    Id | cCourse | cCollege | cTeacher | cTeam | cCount | cProcess | cBrief
    1 | Python数据分析与展示 | 北京理工大学 | 嵩天 | 嵩天 | 470 | 2020年11月17日 ~ 2020年12月29日 | “我们正步入一个数据或许比软件更重要的新时代。——Tim O'Reilly” 运用数据是精准刻画事物、呈现发展规律的主要手段,分析数据展示规律,把思想变得更精细! ——“弹指之间·享受创新”,通过8周学习,你将掌握利用Python语言表示、清洗、统计和展示数据的能力。
    2 | ......
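    The requirements also ask for a simulated user login, which the code below skips (it scrapes the public search page directly). The following is only a rough sketch of the usual Selenium pattern: the icourse163 login form is rendered inside an iframe, and every locator here is a placeholder assumption that must be verified against the live page before use:

      from selenium import webdriver
      import time

      driver = webdriver.Chrome()
      driver.get("https://www.icourse163.org")

      # All XPaths below are hypothetical placeholders; inspect the live page
      # and substitute the real locators before relying on this.
      driver.find_element_by_xpath("//div[@class='unlogin']//a").click()  # open the login dialog (assumed)
      time.sleep(1)

      # The login form typically sits inside an iframe, so Selenium must
      # switch into it before the input fields become reachable.
      frame = driver.find_element_by_xpath("//div[@class='ux-login-set-container']//iframe")  # assumed
      driver.switch_to.frame(frame)
      driver.find_element_by_xpath("//input[@id='phoneipt']").send_keys("your_phone_number")  # assumed id
      driver.find_element_by_xpath("//input[@class='j-inputtext dlemail']").send_keys("your_password")  # assumed class
      driver.find_element_by_xpath("//a[@id='submitBtn']").click()  # assumed id
      driver.switch_to.default_content()  # return to the main document
      time.sleep(3)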
  • Code:
    import pymysql
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    import time
    
    
    class mooc:
        headers = {
            "User-Agent": "SMozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/86.0.4240.183 Safari/537.361 "
        }
    
        def startUp(self, url):
            # Initializing Chrome browser
            chrome_options = Options()
            chrome_options.add_argument('--headless')
            chrome_options.add_argument('--disable-gpu')
            self.driver = webdriver.Chrome(options=chrome_options)
            try:
                self.con = pymysql.connect(host="localhost", port=3306, user="root", passwd='523523', db="mydb",
                                           charset="utf8")
                self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
                self.cursor.execute("delete from mooc")
                self.opened = True
            except Exception as err:
                print(err)
                self.opened = False
    
        # Load the target URL
            self.driver.get(url)
    
        # Row counter
            self.count = 1
    
        def closeUp(self):
            if self.opened:
                self.con.commit()
                self.con.close()
                self.opened = False
                self.driver.close()
            print("closed")
    
        def processSpider(self):
            time.sleep(1)
            try:
                lis = self.driver.find_elements_by_xpath("//div[@class='m-course-list']/div/div[@class]")
                time.sleep(1)
                for li in lis:
                    course = li.find_element_by_xpath(".//div[@class='t1 f-f0 f-cb first-row']").text
                    college = li.find_element_by_xpath(".//a[@class='t21 f-fc9']").text
                    teacher = li.find_element_by_xpath(".//div[@class='t2 f-fc3 f-nowrp f-f0']/a[position()=2]").text
                    team = li.find_element_by_xpath(".//div[@class='t2 f-fc3 f-nowrp f-f0']").text
                    count = li.find_element_by_xpath(".//div[@class='t2 f-fc3 f-nowrp f-f0 margin-top0']/span[@class='hot']").text
                    process = li.find_element_by_xpath(".//span[@class='txt']").text
                    brief = li.find_element_by_xpath(".//span[@class='p5 brief f-ib f-f0 f-cb']").text
                    time.sleep(1)
                    # Print the scraped row
                    print(self.count, course, college, teacher, team, count, process, brief)
                    # Store the row in the table
                    if self.opened:
                        self.cursor.execute(
                            "insert into mooc(wId, wCourse, wCollege, wTeacher, wTeam, wCount, wProcess, wBrief)"
                            "values(%s, %s, %s, %s, %s, %s, %s, %s)",
                            (str(self.count), course, college, teacher, team, str(count), process, brief))
                        self.count += 1
            except Exception as err:
                print(err)
    
        def executeSpider(self, url):
            print("Spider starting......")
            self.startUp(url)
            print("Spider processing......")
            self.processSpider()
            print("Spider closing......")
            self.closeUp()
    
    
    url = 'https://www.icourse163.org/search.htm?search=%E9%9F%B3%E4%B9%90#/'
    spider = mooc()
    spider.executeSpider(url)
  • Results:
  • Reflections:

       This, too, builds on Experiment 1 with a few statements changed. Note that the mooc table the INSERT expects must already exist; a table-creation sketch follows.
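       A minimal sketch (not in the original post) of the mooc table the spider's INSERT assumes; the column names mirror the INSERT statement, while the types and sizes are illustrative assumptions:

         import pymysql

         # Assumed schema for the mooc table written to by the spider above.
         con = pymysql.connect(host="localhost", port=3306, user="root",
                               passwd='523523', db="mydb", charset="utf8")
         cursor = con.cursor()
         cursor.execute("""
             create table if not exists mooc (
                 wId      varchar(16),   -- serial number
                 wCourse  varchar(256),  -- course name
                 wCollege varchar(128),  -- school
                 wTeacher varchar(64),   -- lead teacher
                 wTeam    varchar(256),  -- team members
                 wCount   varchar(64),   -- enrollment count
                 wProcess varchar(128),  -- course schedule
                 wBrief   text           -- course description
             ) default charset=utf8
         """)
         con.commit()
         con.close()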

Original post: https://www.cnblogs.com/wlululu/p/13983423.html