Python 自用代码(知网会议论文网页源代码清洗)

#coding=utf-8
from pymongo import MongoClient
from lxml import etree
import requests

# Label texts of the "organization" and "author" paragraphs on a CNKI page.
# standard_dict() compares them with XPath text() for equality, so the
# surrounding line breaks and indentation are significant and must match
# the stored HTML exactly.
# NOTE(review): the line breaks are written here as "\n"; if the stored
# pages use "\r\n" line endings these literals need "\r\n" instead - confirm
# against a real record.
jigou = u"\n      【机构】\n      "
zuozhe = u"\n        【作者】\n          "

# Open the database
def get_db():
    """Return the authenticated ``cnki`` database on the local MongoDB server.

    Connects to ``localhost:27017``, selects the ``cnki`` database and
    authenticates with the hard-coded user name / password literals.
    """
    cnki = MongoClient('localhost', 27017).cnki
    cnki.authenticate("用户名","密码")
    return cnki

# Fetch the num-th record
def get_data(table, num):
    """Return the ``html`` field of the ``num``-th document in ``table``.

    Documents are counted from 1 in the order ``table.find`` yields them.
    If the ``num``-th document has no (or an empty) ``html`` field, the
    counter stays at ``num`` and the first later document with a non-empty
    ``html`` is returned instead.

    Args:
        table: collection-like object exposing ``find(filter, projection)``.
        num: 1-based position of the wanted document.

    Returns:
        The ``html`` value, or ``None`` when no matching document exists.
    """
    position = 1
    for item in table.find({}, {"html": 1, "_id": 0}):
        if position != num:
            position += 1
            continue
        # .get() replaces the deprecated dict.has_key(); a missing key and
        # an empty value are both treated as "no html".
        html = item.get('html')
        if html:
            return html
    return None

# First element of a list as a string
def list_str(list):
    """Return the first element of ``list``, or ``""`` when it is empty."""
    if len(list) == 0:
        return ""
    return list[0]

# English author names, English organization names
def en_ls(list, length1, length2):
    """Split the English author line into author and organization names.

    Only the first element of ``list`` is used. The "【Author】" label and
    line breaks are removed, the text is stripped and split on ";".

    Args:
        list: text nodes of the English author paragraph.
        length1: expected number of authors.
        length2: expected number of organizations.

    Returns:
        A ``(authors, organizations)`` pair of ";"-joined strings, or
        ``("", "")`` when ``list`` is empty or the number of fields does
        not equal ``length1 + length2 + 1``.
    """
    if len(list) == 0:
        return "", ""
    text = list[0].replace(u"【Author】", "")
    # Remove CR as well as LF so either line-ending convention is handled.
    text = text.replace("\r", "").replace("\n", "").strip()
    fields = text.split(";")
    # One extra field is expected and discarded (presumably the empty
    # string left behind by a trailing ";").
    if len(fields) != length1 + length2 + 1:
        return "", ""
    return ";".join(fields[:length1]), ";".join(fields[length1:-1])

def hyxx(list):
    """Extract the conference information fields from a list of text nodes.

    Each item is matched against the labels below, in this order; the first
    label found in the item decides which field it fills. The label and any
    line breaks are removed and the remainder is stripped. If several items
    carry the same label, the last one wins.

    Args:
        list: text nodes of the summary list items.

    Returns:
        A 6-tuple of strings: proceedings title, conference title,
        conference date, conference place, classification number and
        organizer. Fields that are not found are ``""``.
    """
    labels = (
        u"【会议录名称】",
        u"【会议名称】",
        u"【会议时间】",
        u"【会议地点】",
        u"【分类号】",
        u"【主办单位】",
    )
    # Every field defaults to "" so the result has the same types whether or
    # not a label is present (the classification number used to default to
    # a list).
    values = [""] * len(labels)
    for item in list:
        for index, label in enumerate(labels):
            if label in item:
                text = item.replace(label, "")
                # NOTE(review): the organizer branch used to call
                # replace(<separator>, ";") but the separator literal is
                # empty in this copy of the code, which would insert ";"
                # between every character. The call is omitted until the
                # real separator character is known.
                values[index] = text.replace("\r", "").replace("\n", "").strip()
                break
    return tuple(values)

# Join a list into a string
def list2str(list):
    """Return the elements of ``list`` joined with ";" (``""`` if empty)."""
    return ";".join(list) if len(list) != 0 else ""

# Build the dictionary that is stored for one paper
def standard_dict(html):
    """Parse one paper page and return its fields as a dictionary.

    Args:
        html: HTML source of the paper page.

    Returns:
        A dict with the keys ``title``, ``title_eng``, ``author``,
        ``organization``, ``author_eng``, ``organization_eng``, ``summary``,
        ``summary_eng``, ``keywords``, ``keywords_eng``, ``proceeding_title``,
        ``conference_title``, ``conference_date``, ``conference_place``,
        ``huiyflh`` and ``conference_org``.
    """
    dc = {}
    tree = etree.HTML(html)
    # Chinese and foreign-language titles
    dc["title"] = list_str(tree.xpath("//span[@id='chTitle']/text()"))
    dc["title_eng"] = list_str(tree.xpath("//span[@id='enTitle']/text()"))
    # Authors and organizations: each query is evaluated once; the counts
    # are needed below to split the English name line.
    authors = tree.xpath("//p[text()='%s']/a/text()" % zuozhe)
    organizations = tree.xpath("//p[text()='%s']/a/text()" % jigou)
    dc["author"] = list2str(authors)
    dc["organization"] = list2str(organizations)
    # English author names and English organization names
    dc["author_eng"], dc["organization_eng"] = en_ls(
        tree.xpath("//p[@id='au_en']/text()"), len(authors), len(organizations))
    # Chinese and English abstracts
    dc["summary"] = list_str(tree.xpath("//span[@id='ChDivSummary']/text()"))
    dc["summary_eng"] = list_str(tree.xpath("//span[@id='EnChDivSummary']/text()"))
    # Chinese and English keywords
    dc["keywords"] = list2str(tree.xpath("//div[@class='keywords']/span[1]/a/text()"))
    dc["keywords_eng"] = list2str(tree.xpath("//div[@class='keywords']/span[2]/a/text()"))
    # Conference information
    (dc["proceeding_title"], dc["conference_title"], dc["conference_date"],
     dc["conference_place"], dc["huiyflh"], dc["conference_org"]) = hyxx(
        tree.xpath("//div[@class='summary']/ul/li/text()"))
    if dc["proceeding_title"] == "":
        # Fall back to the link text inside the first summary list when the
        # proceedings title was not found among the plain text nodes.
        dc["proceeding_title"] = list_str(
            tree.xpath("//div[@class='summary']/ul[1]/li/a/text()"))

    return dc

# Entry point
def main():
    """Clean every stored conference page and save the extracted fields.

    Reads the ``html`` field of each document in the ``conference``
    collection, converts it with ``standard_dict`` and inserts the result
    into ``conference_cleaned``. Documents without a non-empty ``html``
    field are skipped.
    """
    db = get_db()
    collection = db.conference
    collection2 = db.conference_cleaned
    for item in collection.find({}, {"html": 1, "_id": 0}):
        # .get() replaces the deprecated dict.has_key(); a missing key and
        # an empty value are both skipped.
        html = item.get('html')
        if html:
            # NOTE(review): Collection.insert is deprecated in PyMongo 3 in
            # favour of insert_one; kept because the installed PyMongo
            # version is not known from this file.
            collection2.insert(standard_dict(html))


if __name__ == '__main__':
    main()
    # The commented-out code below cleans one specific record, for testing
    # db = get_db()
    # collection=db.conference
    # data = get_data(collection, 1)
    # dc = standard_dict(data)
    # for k,v in dc.items():
    #     print k,v
原文地址:https://www.cnblogs.com/zhangtianyuan/p/7192845.html