爬取编程常用单词

爬取编程常用的英文单词

网站分析:

  通过抓包工具分析可知,页面内容并非动态加载,直接请求页面 URL 即可得到包含单词数据的 HTML。

代码编写:

  

#author:ziqingbajin
# title:爬取扇贝网编程需背单词

import requests
from time import sleep
from lxml import etree
from SQL import Sql


# First page of the word list. Only the ``page`` query parameter differs
# between pages, e.g. https://www.shanbay.com/wordlist/104899/202159/?page=2
url = 'https://www.shanbay.com/wordlist/104899/202159/?page=1'

# 访问url
def get_url(url, timeout=10):
    """Download a page and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; handed
            to ``requests.get`` unchanged.

    Returns:
        The response body as decoded by requests (``response.text``).

    Raises:
        requests.exceptions.RequestException: If the connection fails or
            the timeout expires.
    """
    # Send a browser User-Agent instead of the default requests one.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
    }
    # Without an explicit timeout requests waits indefinitely on a stalled
    # connection, which would hang the whole crawl.
    response = requests.get(url=url, headers=headers, timeout=timeout)
    return response.text

# 数据解析
def analysis(response):
    """Extract (English, Chinese) word pairs from a word-list page.

    Args:
        response: HTML source of the page as a string.

    Returns:
        A list of two-element lists ``[english_word, chinese_word]``, one
        per table row that contains both values. Rows lacking either cell
        or its text are skipped.
    """
    tree = etree.HTML(response)
    tr_list = tree.xpath('/html/body/div[3]/div/div[1]/div[2]/div/table/tbody/tr')
    word_list = []
    for tr in tr_list:
        english_parts = tr.xpath('./td[1]//text()')  # English word
        chinese_parts = tr.xpath('./td[2]/text()')   # Chinese translation
        # xpath() returns an empty list when nothing matches; indexing it
        # blindly would raise IndexError and abort the whole page.
        if not english_parts or not chinese_parts:
            continue
        word_list.append([english_parts[0], chinese_parts[0]])
    return word_list
# 数据存储
def save_data(word_list):
    """Append word records to ``shanbei.txt`` as tab-separated lines.

    Args:
        word_list: Sequence of records whose first two items are the
            English word and its Chinese translation (both strings).

    Each record is written as ``english<TAB>chinese`` followed by a
    newline. The file is opened in append mode with UTF-8 encoding, so
    repeated calls accumulate lines.
    """
    # Nothing to write: return early so an empty input does not create
    # the file.
    if not word_list:
        return
    # Open the file once for the whole batch instead of once per record.
    with open('shanbei.txt', 'a', encoding='utf-8') as fp:
        for word_record in word_list:
            fp.write(word_record[0] + '\t' + word_record[1] + '\n')
# 保存到数据库中
def save_sql(word_list):
    """Insert each word record into the ``shanbay`` database table.

    Args:
        word_list: Sequence of records whose first two items are the
            English word and its Chinese translation.

    A failed insert is printed and skipped so that one bad record does
    not stop the rest of the batch.
    """
    # ``Sql`` is the author's own database helper class. Per the original
    # note, readers can build an equivalent database with pymssql; the table
    # used here has three columns: id, English, Chinese.
    conn = Sql('PC')
    for word_record in word_list:
        # Double any single quote so that scraped text containing an
        # apostrophe cannot terminate the SQL string literal early.
        # NOTE(review): if the Sql wrapper supports parameterized
        # queries, prefer those over string interpolation.
        english_word = str(word_record[0]).replace("'", "''")
        chinese_word = str(word_record[1]).replace("'", "''")
        sqlstr = "insert into shanbay(English,Chinese)values('%s','%s')" % (english_word, chinese_word)
        # Best-effort insert: report the error and move on to the next record.
        try:
            conn.insert(sqlstr)
        except Exception as e:
            print(e)


if __name__ == '__main__':
    # Build the URL for each page (1 through 9) and run the full pipeline:
    # download, parse, save to text file, save to database.
    for page in range(1, 10):
        print('第%s页数据爬取中······' % (page))
        url = 'https://www.shanbay.com/wordlist/104899/202159/?page=%s' % (page)
        response = get_url(url)
        word_list = analysis(response)
        save_data(word_list)
        save_sql(word_list)
    print('下载完成!!!')

实现效果:

  运行时:

  存储:

谢谢观看!

  

原文地址:https://www.cnblogs.com/Blogwj123/p/13905839.html